Skip to content

Commit

Permalink
Fixed attributes parsing in HTML and improved JSON filter
Browse files Browse the repository at this point in the history
  • Loading branch information
rmraya committed Aug 29, 2024
1 parent 7719a24 commit 955766e
Show file tree
Hide file tree
Showing 11 changed files with 594 additions and 19 deletions.
2 changes: 1 addition & 1 deletion build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
</target>
<target name="link" depends="distclean,compile">
<description>Build Java binaries</description>
<link destDir="dist" modulepath="lib:${java.home}/jmods">
<link destDir="dist" modulepath="lib:${java.home}/jmods" includeManPages="false">
<module name="openxliff" />
</link>
<delete file="dist/lib/jrt-fs.jar" />
Expand Down
534 changes: 534 additions & 0 deletions catalog/tbx/TBX_core.xsd

Large diffs are not rendered by default.

Binary file modified lib/openxliff.jar
Binary file not shown.
2 changes: 1 addition & 1 deletion sonar-project.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
sonar.projectKey=OpenXLIFF
# this is the name displayed in the SonarQube UI
sonar.projectName=OpenXLIFF
sonar.projectVersion=3.20.0
sonar.projectVersion=3.21.0

# Path is relative to the sonar-project.properties file. Replace "\" by "/" on Windows.
# Since SonarQube 4.2, this property is optional if sonar.modules is set.
Expand Down
4 changes: 2 additions & 2 deletions src/com/maxprograms/converters/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ private Constants() {

public static final String TOOLID = "OpenXLIFF";
public static final String TOOLNAME = "OpenXLIFF Filters";
public static final String VERSION = "3.20.0";
public static final String BUILD = "202403612_0703";
public static final String VERSION = "3.21.0";
public static final String BUILD = "20240829_1846";

public static final String SUCCESS = "0";
public static final String ERROR = "1";
Expand Down
4 changes: 2 additions & 2 deletions src/com/maxprograms/converters/html/Html2Xliff.java
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ public static List<String> run(Map<String, String> params) {
output.close();
}
result.add(Constants.SUCCESS);
} catch (IOException | SAXException | ParserConfigurationException | URISyntaxException e) {
} catch (IOException | SAXException | ParserConfigurationException | URISyntaxException | StringIndexOutOfBoundsException e) {
Logger logger = System.getLogger(Html2Xliff.class.getName());
logger.log(Level.ERROR, Messages.getString("Html2Xliff.2"), e);
result.add(Constants.ERROR);
Expand Down Expand Up @@ -812,7 +812,7 @@ private static Map<String, Attribute> attributesMap(String element) {
}
if (token.startsWith("\"") || token.startsWith("'")) {
String quote = token.substring(0, 1);
if (token.endsWith(quote)) {
if (token.endsWith(quote) && token.length() > 1) {
// value is one word
atts.put(key, new Attribute(key, token.substring(1, token.length() - 1)));
} else {
Expand Down
10 changes: 5 additions & 5 deletions src/com/maxprograms/converters/json/ElementBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ private ElementBuilder() {
// private for security
}

public static ElementHolder buildElement(String name, String string, boolean trimTags) {
public static ElementHolder buildElement(String name, String string, boolean trimTags, boolean mergeTags) {
Element element = new Element(name);
element.setText(string);
fixHtmlTags(element);
fixHtmlTags(element, mergeTags);
String start = "";
String end = "";
if (!element.getChildren().isEmpty()) {
Expand Down Expand Up @@ -71,9 +71,9 @@ public static ElementHolder buildElement(String name, String string, boolean tri
return new ElementHolder(element, start, end);
}

private static void fixHtmlTags(Element src) {
private static void fixHtmlTags(Element src, boolean mergeTags) {
if (pattern == null) {
pattern = Pattern.compile("<[A-Za-z0-9]+([\\s][A-Za-z\\-\\.]+=[\"|\'][^<&>]*[\"|\'])*[\\s]*/?>");
pattern = Pattern.compile("<[A-Za-z0-9]+([\\s]+[A-Za-z\\-\\.]+=[\"|\'][^<&>]*[\"|\'])*[\\s]*/?>");
}
if (endPattern == null) {
endPattern = Pattern.compile("</[A-Za-z0-9]+>");
Expand Down Expand Up @@ -170,7 +170,7 @@ private static void fixHtmlTags(Element src) {
}
src.setContent(newContent);
}
if (src.getChildren().size() > 1) {
if (mergeTags && src.getChildren().size() > 1) {
mergeTags(src);
}
}
Expand Down
22 changes: 16 additions & 6 deletions src/com/maxprograms/converters/json/Json2Xliff.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ public class Json2Xliff {
private static int bomLength = 0;
private static List<String[]> entities;
private static boolean trimTags;
private static boolean mergeTags;
private static boolean rawSegmentation;

private Json2Xliff() {
// do not instantiate this class
Expand All @@ -76,6 +78,8 @@ public static List<String> run(Map<String, String> params) {
segments = new ArrayList<>();
ids = new HashSet<>();
trimTags = true;
mergeTags = true;
rawSegmentation = false;
boolean exportHTML = false;
entities = new ArrayList<>();

Expand Down Expand Up @@ -107,6 +111,8 @@ public static List<String> run(Map<String, String> params) {
if (configFile != null) {
JsonConfig config = JsonConfig.parseFile(configFile);
trimTags = config.getTrimTags();
mergeTags = config.getMergeTags();
rawSegmentation = config.getRawSegmentation();
exportHTML = config.getExportHTML();
if (config.getParseEntities()) {
entities = loadEntities(catalog);
Expand Down Expand Up @@ -361,11 +367,13 @@ private static void parseJson(JSONObject json, JsonConfig config) throws IOExcep

String[] sourceSegments = new String[] { sourceText };
if (segmenter != null) {
sourceSegments = segmenter.segment(sourceText);
sourceSegments = rawSegmentation ? segmenter.segmentRawString(sourceText)
: segmenter.segment(sourceText);
}
String[] targetSegments = new String[] {};
if (!tgtLang.isEmpty() && !targetText.isEmpty() && targetSegmenter != null) {
targetSegments = targetSegmenter.segment(targetText);
targetSegments = rawSegmentation ? segmenter.segmentRawString(targetText)
: targetSegmenter.segment(targetText);
if (targetSegments.length != sourceSegments.length) {
sourceSegments = new String[] { sourceText };
targetSegments = new String[] { targetText };
Expand All @@ -385,7 +393,8 @@ private static void parseJson(JSONObject json, JsonConfig config) throws IOExcep
}
ids.add(transUnit.getAttributeValue("id"));
transUnit.addContent("\n ");
ElementHolder sourceHolder = ElementBuilder.buildElement("source", sourceSegments[h], trimTags);
ElementHolder sourceHolder = ElementBuilder.buildElement("source", sourceSegments[h], trimTags,
mergeTags);
Element source = sortTags(sourceHolder.getElement());
transUnit.addContent(source);
if (transUnit.getChild("source").getChildren().isEmpty()) {
Expand All @@ -400,7 +409,8 @@ private static void parseJson(JSONObject json, JsonConfig config) throws IOExcep
sb.append(sourceHolder.getEnd());
json.put(sourceKey, sb.toString());
} else {
ElementHolder targetHolder = ElementBuilder.buildElement("target", targetSegments[h], trimTags);
ElementHolder targetHolder = ElementBuilder.buildElement("target", targetSegments[h], trimTags,
mergeTags);
Element target = matchTags(source, targetHolder.getElement());
transUnit.addContent("\n ");
transUnit.addContent(target);
Expand Down Expand Up @@ -583,7 +593,7 @@ private static List<String> harvestNotes(Object object) {

private static String parseText(String string) {
if (!paragraphSegmentation) {
String[] segs = segmenter.segment(string);
String[] segs = rawSegmentation ? segmenter.segmentRawString(string) : segmenter.segment(string);
StringBuilder result = new StringBuilder();
for (int i = 0; i < segs.length; i++) {
result.append(addSegment(segs[i]));
Expand All @@ -597,7 +607,7 @@ private static String addSegment(String string) {
Element segment = new Element("trans-unit");
segment.setAttribute("id", "" + id);
segment.addContent("\n ");
ElementHolder holder = ElementBuilder.buildElement("source", string, trimTags);
ElementHolder holder = ElementBuilder.buildElement("source", string, trimTags, mergeTags);
segment.addContent(holder.getElement());
segment.addContent("\n ");
if (holder.getElement().getChildren().isEmpty()) {
Expand Down
18 changes: 18 additions & 0 deletions src/com/maxprograms/converters/json/JsonConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ public class JsonConfig {
private List<String> sourceKeys;
private boolean parseEntities;
private boolean trimTags;
private boolean mergeTags;
private boolean rawSegmentation;
private boolean exportHTML;

private JsonConfig() {
Expand All @@ -46,6 +48,8 @@ private JsonConfig() {
sourceKeys = new Vector<>();
parseEntities = false;
trimTags = true;
mergeTags = true;
rawSegmentation = false;
exportHTML = true;
}

Expand Down Expand Up @@ -85,6 +89,12 @@ public static JsonConfig parseFile(String configFile) throws IOException, JSONEx
if (configObject.has("trimTags")) {
config.trimTags = configObject.getBoolean("trimTags");
}
if (configObject.has("mergeTags")) {
config.mergeTags = configObject.getBoolean("mergeTags");
}
if (configObject.has("rawSegmentation")) {
config.rawSegmentation = configObject.getBoolean("rawSegmentation");
}
if (configObject.has("exportHTML")) {
config.exportHTML = configObject.getBoolean("exportHTML");
}
Expand All @@ -111,6 +121,14 @@ public boolean getTrimTags() {
return trimTags;
}

/**
 * Returns the value of the {@code mergeTags} option held by this
 * configuration.
 *
 * @return the current {@code mergeTags} flag
 */
public boolean getMergeTags() {
    return this.mergeTags;
}

/**
 * Returns the value of the {@code rawSegmentation} option held by this
 * configuration.
 *
 * @return the current {@code rawSegmentation} flag
 */
public boolean getRawSegmentation() {
    return this.rawSegmentation;
}

/**
 * Returns the value of the {@code exportHTML} option held by this
 * configuration.
 *
 * @return the current {@code exportHTML} flag
 */
public boolean getExportHTML() {
    return this.exportHTML;
}
Expand Down
6 changes: 5 additions & 1 deletion src/com/maxprograms/converters/json/README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# JSON Filter Configuration Files

Configuration files for JSON filter are defined in a JSON file that contains two arrays and three optional boolean keys:
Configuration files for JSON filter are defined in a JSON file that contains two arrays and five optional boolean keys:

- `translatable`: array of JSON objects that define translatable keys
- `ignorable`: array of strings listing ignorable keys
- `parseEntities`: boolean value indicating whether HTML entities should be converted to Unicode characters. Default: `false`
- `trimTags`: send initial/trailing tags to skeleton when possible. Default: `true`
- `mergeTags`: merge adjacent tags. Default: `true`
- `rawSegmentation`: treat source and target as plain text for segmentation. Default: `false`
- `exportHTML`: treat target as containing HTML on merge. Default: `true`

Configuration files must be written using UTF-8 character set without a byte order mark (BOM).
Expand Down Expand Up @@ -61,6 +63,8 @@ Configuration files must be written using UTF-8 character set without a byte ord
],
"parseEntities": true,
"trimTags": false,
"mergeTags": false,
"rawSegmentation": true,
"exportHTML" : true
}
```
Expand Down
11 changes: 10 additions & 1 deletion src/com/maxprograms/segmenter/Segmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ public Segmenter(String srxFile, String srcLanguage, Catalog catalog)
if (!root.getAttributeValue("version").equals("2.0")) {
throw new IOException(Messages.getString("Segmenter.2"));
}
tags = new HashMap<>();
cascade = isCascading();
buildRulesList(srcLanguage);
}
Expand All @@ -71,11 +72,19 @@ public Segmenter(Document doc, String srcLanguage) throws IOException {
buildRulesList(srcLanguage);
}

/**
 * Segments the given text with the preparation step switched off.
 * Delegates to the two-argument {@code segment} overload, passing
 * {@code false} as its second argument.
 *
 * @param string the text to split into segments
 * @return the segments produced by the delegate call
 */
public String[] segmentRawString(String string) {
    final boolean prepareFirst = false;
    return segment(string, prepareFirst);
}

/**
 * Segments the given text with the preparation step switched on.
 * Delegates to the two-argument {@code segment} overload, passing
 * {@code true} as its second argument.
 *
 * @param string the text to split into segments
 * @return the segments produced by the delegate call
 */
public String[] segment(String string) {
    final boolean prepareFirst = true;
    return segment(string, prepareFirst);
}

private String[] segment(String string, boolean prepare) {
if (string == null || string.isEmpty()) {
return new String[] {};
}
String pureText = prepareString(string);
String pureText = prepare ? prepareString(string) : string;
List<String> parts = new ArrayList<>();
for (int pos = 0; pos < pureText.length(); pos++) {
String left = hideTags(pureText.substring(0, pos));
Expand Down

0 comments on commit 955766e

Please sign in to comment.