Skip to content

Commit

Permalink
docs: add comments in DocxLoader
Browse files Browse the repository at this point in the history
  • Loading branch information
Yagnap committed Jul 10, 2024
1 parent dbde592 commit 251df6c
Showing 1 changed file with 29 additions and 23 deletions.
52 changes: 29 additions & 23 deletions src/main/java/eu/snik/tag/DocxLoader.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,6 @@
/** Extracts SNIK classes from a tagged DOCX file. */
public class DocxLoader extends Loader {

// private static ObjectFactory factory = Context.getWmlObjectFactory();
// private static int commentId = 10000;

/**
* Creates a new instance for the DOCX loader to load one DOCX file
* @param in Input stream for the DOCX file to load
Expand All @@ -35,11 +32,18 @@ public DocxLoader(InputStream in) throws IOException {
super(in);
}

/** from https://stackoverflow.com/questions/19676282/docx4j-find-and-replace */
static List<Object> getAllElementsFromObject(Object obj, Class<?>... toSearch) {
/**
* Search for any occurrences of Docx4J instances from any given node in the document tree.
* Originally from <a href="http://www.smartjava.org/content/create-complex-word-docx-documents-programatically-docx4j/">this blog post</a>.
* @param obj object to search in
* @param toSearch Classes to search for
* @return List of all occurrences of instances of the given classes as self, children or transitive children of the given object.
*/
private static List<Object> getAllElementsFromObject(Object obj, Class<?>... toSearch) {
List<Object> result = new ArrayList<Object>();
if (obj instanceof JAXBElement) obj = ((JAXBElement<?>) obj).getValue();

// only add the object to the result list after processing its children
if (obj instanceof ContentAccessor) {
List<?> children = ((ContentAccessor) obj).getContent();
for (Object child : children) {
Expand All @@ -54,7 +58,10 @@ static List<Object> getAllElementsFromObject(Object obj, Class<?>... toSearch) {
return result;
}

/** @return the complete text from the DOCX file without any formatting */
/**
* Get the entire unformatted textual content of the document.
* @return the complete text from the DOCX file without any formatting (except line breaks)
*/
@Override
public String getText() {
try {
Expand All @@ -63,7 +70,10 @@ public String getText() {
var doc = wordMLPackage.getMainDocumentPart();
var parts = new ArrayList<String>();

List<Object> texts = getAllElementsFromObject(doc, org.docx4j.wml.Text.class, org.docx4j.wml.P.class);
// extract all text passages (including paragraph objects for information on line breaks)
List<Object> texts = DocxLoader.getAllElementsFromObject(doc, org.docx4j.wml.Text.class, org.docx4j.wml.P.class);

// convert org.docx4j.wml.Text-s to Strings (interpret paragraphs as line breaks)
for (Object t : texts) {
if(t instanceof org.docx4j.wml.P) {
parts.add("\n\n");
Expand All @@ -72,6 +82,8 @@ public String getText() {
parts.add(content.getValue());
}
}

// put the parts together
return parts
.stream()
.reduce(
Expand Down Expand Up @@ -100,9 +112,15 @@ public String getText() {
}
}

record TagClass(String tag, String description, Subtop subtop) {}
/**
* Local type used for quickly identifying tagged tokens.
*/
private record TagClass(String tag, String description, Subtop subtop) {}

/** @return all classes extracted from the tagged parts of the DOCX document*/
/**
* Extract all classes marked in the DOCX document, without any duplicates.
* @return all classes extracted from the tagged parts of the DOCX document
*/
@Override
public Collection<Clazz> getClasses() {
try {
Expand Down Expand Up @@ -142,20 +160,9 @@ public Collection<Clazz> getClasses() {
continue;
} // abbreviations
processedRuns.add(run);

/*
Comment comment = factory.createCommentsComment();
comments.add(comment);
comment.setId(BigInteger.valueOf(++commentId));
Text commentText = factory.createText();
commentText.setValue("this is a comment for "+label);
comment.getContent().add(commentText);
CommentReference commentRef = factory.createRCommentReference();
run.getContent().add(commentRef);
commentRef.setId(BigInteger.valueOf(commentId));
*/

// remove multiply annotated tokens, then add the rest to processedLabels
Clazz clazz = new Clazz(label, labelToLocalName(label), tc.subtop);
//System.out.println(text+" "+ clazz);
if (processedLabels.contains(label)) {
classes
.stream()
Expand Down Expand Up @@ -184,7 +191,6 @@ public Collection<Clazz> getClasses() {
}
}

//warningCallback.ifPresent(c->c.accept(warnings.stream().reduce("", (a,b)->a+"\n"+b)));
System.out.println(classes.size() + " classes extracted.");

return classes;
Expand Down

0 comments on commit 251df6c

Please sign in to comment.