How to get all Text elements of a Paragraph with docx4j
Posted: Wed Oct 29, 2014 7:30 pm
Hi,
I am trying to create a Map whose key is a Paragraph and whose value is the List of Text elements inside it. I use this map to count how many instances of a word occur inside a document part. I need to do this because the document is not a plain docx document, but a document in which another document is embedded (like an "include"). For this reason I can't simply read all the plain text of the document.
So I've implemented this algorithm to traverse the document and create this map. This is the code.
I've created this on the basis of the analysis of some documents, but I don't know if I've forgotten some docx4j element. I would like to make this algorithm as general as possible so that it works on every document I will have. Any suggestions?
Also, is there any other smart way to do this, for example with TraversalUtil? Any example?
Thanks
I am trying to create a Map whose key is a Paragraph and whose value is the List of Text elements inside it. I use this map to count how many instances of a word occur inside a document part. I need to do this because the document is not a plain docx document, but a document in which another document is embedded (like an "include"). For this reason I can't simply read all the plain text of the document.
So I've implemented this algorithm to traverse the document and create this map. This is the code.
- Code: Select all
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.bind.JAXBElement;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.wml.ContentAccessor;
import org.docx4j.wml.P;
import org.docx4j.wml.R;
import org.docx4j.wml.SdtBlock;
import org.docx4j.wml.SdtPr;
import org.docx4j.wml.SdtRun;
import org.docx4j.wml.Text;
/**
 * Counts how many times a search term occurs in the main document part of a
 * .docx file, skipping content controls whose alias marks them as an
 * "include" of another document.
 *
 * <p>The document is walked once to build a map from each paragraph
 * ({@link P}) to the {@link Text} elements found inside it. The text of each
 * paragraph is then concatenated and searched, so a match may span several
 * runs of one paragraph but never crosses a paragraph boundary.
 *
 * <p>The search term is compiled as a case-insensitive regular expression
 * after a handful of Word-style tokens (for example {@code ^t}, {@code ^?},
 * {@code ^#}) have been translated; any other regex metacharacter in the term
 * keeps its regex meaning.
 */
public class FindWordAndReplaceTest {

    /** Word-style search tokens and the text / regex fragment each one is replaced with. */
    private static final String[][] SEARCH_TOKEN_REPLACEMENTS = {
        {"^t", "\t"},
        {"^=", "\u2013"},
        {"^+", "\u2014"},
        {"^s", "\u00A0"},
        {"^?", "."},
        {"^#", "\\d"},
        {"^$", "\\p{L}"}
    };

    /** Substring of a content-control alias that marks the control as an include. */
    private static final String INCLUDE_ALIAS_MARKER = "Include :";

    private final String toFind;

    /**
     * @param toFind the term to search for (see the class comment for how it is interpreted)
     */
    public FindWordAndReplaceTest(String toFind) {
        this.toFind = toFind;
    }

    /**
     * Loads the given .docx file and counts the occurrences of the search term.
     *
     * @param file the .docx file to inspect
     * @return the number of matches, summed over all collected paragraphs
     * @throws Docx4JException if the package cannot be loaded
     */
    public int wordOccurances(File file) throws Docx4JException {
        WordprocessingMLPackage wmlPackage = WordprocessingMLPackage.load(file);
        return findWord(wmlPackage, toFind);
    }

    /**
     * Builds the paragraph-to-text map for the main document part and counts
     * the matches of {@code toFind} in every paragraph.
     */
    private int findWord(WordprocessingMLPackage doc, String toFind) {
        HashMap<ContentAccessor, List<Text>> caMap = new HashMap<ContentAccessor, List<Text>>();
        for (Object child : doc.getMainDocumentPart().getContent()) {
            collectBlockLevel(unwrap(child), caMap);
        }

        // Compile once; the pattern is the same for every paragraph.
        Pattern pattern = compileSearchPattern(toFind);
        int wordOcc = 0;
        for (List<Text> texts : caMap.values()) {
            if (texts.isEmpty()) {
                continue;
            }
            StringBuilder builder = new StringBuilder();
            for (Text text : texts) {
                String value = text.getValue();
                // A Text without a value would otherwise be appended as the literal "null".
                if (value != null) {
                    builder.append(value);
                }
            }
            wordOcc += countMatches(pattern, builder);
        }
        return wordOcc;
    }

    /**
     * Handles one (already unwrapped) block-level element: block content
     * controls are descended into, paragraphs are registered in {@code caMap},
     * other containers are traversed for the paragraphs they hold.
     */
    private void collectBlockLevel(Object element, HashMap<ContentAccessor, List<Text>> caMap) {
        if (element instanceof SdtBlock) {
            collectFromSdtBlock((SdtBlock) element, caMap);
        } else if (element instanceof ContentAccessor) {
            ContentAccessor caElement = (ContentAccessor) element;
            List<Text> texts = getAllTextfromContenAccessor(caElement, caMap);
            if (element instanceof P) {
                caMap.put(caElement, texts);
            }
        }
    }

    /**
     * Traverses every child of a block content control unless it is an include.
     * Nested block content controls are handled recursively, so siblings that
     * follow a nested control are still visited and the include check is
     * applied at every nesting level.
     */
    private void collectFromSdtBlock(SdtBlock sdtBlock, HashMap<ContentAccessor, List<Text>> caMap) {
        if (checkIfInclude(sdtBlock.getSdtPr()) || sdtBlock.getSdtContent() == null) {
            return;
        }
        for (Object o : sdtBlock.getSdtContent().getContent()) {
            collectBlockLevel(unwrap(o), caMap);
        }
    }

    /** Translates the Word-style tokens in {@code term} and compiles the result case-insensitively. */
    private static Pattern compileSearchPattern(String term) {
        String regex = term;
        for (String[] replacement : SEARCH_TOKEN_REPLACEMENTS) {
            regex = regex.replace(replacement[0], replacement[1]);
        }
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    }

    /** Counts the non-overlapping matches of {@code pattern} in {@code text}. */
    private static int countMatches(Pattern pattern, CharSequence text) {
        Matcher m = pattern.matcher(text);
        int count = 0;
        while (m.find()) {
            count++;
        }
        return count;
    }

    /**
     * Checks whether a content control is an include object, i.e. whether its
     * alias contains {@code "Include :"}.
     *
     * @param sdtPr the content-control properties; {@code null} is treated as "not an include"
     * @return {@code true} if an alias containing the include marker is present
     */
    private boolean checkIfInclude(SdtPr sdtPr) {
        if (sdtPr == null) {
            return false;
        }
        for (Object child : sdtPr.getRPrOrAliasOrLock()) {
            Object value = unwrap(child);
            if (value instanceof SdtPr.Alias) {
                // Only the first alias decides, as before.
                String alias = ((SdtPr.Alias) value).getVal();
                return alias != null && alias.contains(INCLUDE_ALIAS_MARKER);
            }
        }
        return false;
    }

    /**
     * Collects the text that belongs directly to {@code ca}.
     *
     * <p>Paragraphs found below {@code ca} are registered in {@code caMap}
     * under their own key and are not part of the returned list. Text found in
     * any other nested container is added to the returned list, so that it is
     * counted with the enclosing paragraph.
     *
     * @param ca    the container to traverse
     * @param caMap the paragraph-to-text map that nested paragraphs are added to
     * @return the text elements owned by {@code ca}, in document order
     */
    private List<Text> getAllTextfromContenAccessor(ContentAccessor ca, HashMap<ContentAccessor, List<Text>> caMap) {
        List<Text> textList = new ArrayList<Text>();
        for (Object rawChild : ca.getContent()) {
            Object child = unwrap(rawChild);
            if (child instanceof Text) {
                textList.add((Text) child);
            } else if (child instanceof R) {
                // Must be tested before ContentAccessor so runs get tab / soft-hyphen handling.
                collectRunText((R) child, textList);
            } else if (child instanceof ContentAccessor) {
                ContentAccessor caElement = (ContentAccessor) child;
                List<Text> nestedTexts = getAllTextfromContenAccessor(caElement, caMap);
                if (child instanceof P) {
                    caMap.put(caElement, nestedTexts);
                } else {
                    // Previously this result was dropped, losing the nested container's text.
                    textList.addAll(nestedTexts);
                }
            } else if (child instanceof SdtRun) {
                getAllTextFromSdtRun((SdtRun) child, textList, caMap);
            } else if (child instanceof SdtBlock) {
                // Block content control nested inside another container.
                collectFromSdtBlock((SdtBlock) child, caMap);
            }
        }
        return textList;
    }

    /**
     * Appends the text of an inline content control to {@code textList},
     * unless the control is an include.
     *
     * @param sdtRun   the inline content control
     * @param textList the list to append to (the enclosing paragraph's text)
     * @param caMap    the paragraph-to-text map that nested paragraphs are added to
     * @return {@code textList}, for convenience
     */
    public List<Text> getAllTextFromSdtRun(SdtRun sdtRun, List<Text> textList, HashMap<ContentAccessor, List<Text>> caMap) {
        if (checkIfInclude(sdtRun.getSdtPr()) || sdtRun.getSdtContent() == null) {
            return textList;
        }
        for (Object rawChild : sdtRun.getSdtContent().getContent()) {
            Object o = unwrap(rawChild);
            if (o instanceof R) {
                collectRunText((R) o, textList);
            } else if (o instanceof ContentAccessor) {
                ContentAccessor caElement = (ContentAccessor) o;
                List<Text> nestedTexts = getAllTextfromContenAccessor(caElement, caMap);
                if (o instanceof P) {
                    caMap.put(caElement, nestedTexts);
                } else {
                    textList.addAll(nestedTexts);
                }
            } else if (o instanceof SdtRun) {
                // Inline content control nested in another one.
                getAllTextFromSdtRun((SdtRun) o, textList, caMap);
            }
        }
        return textList;
    }

    /**
     * Appends the text carried by one run: real {@link Text} elements as they
     * are, tabs as {@code "\t"} and soft hyphens as U+00AD.
     */
    private static void collectRunText(R run, List<Text> textList) {
        for (Object rawItem : run.getContent()) {
            Object item = unwrap(rawItem);
            if (item instanceof R.Tab) {
                textList.add(syntheticText("\t"));
            } else if (item instanceof R.SoftHyphen) {
                textList.add(syntheticText("\u00AD"));
            } else if (item instanceof Text) {
                textList.add((Text) item);
            }
        }
    }

    /** Creates a detached {@link Text} holding {@code value}; it is not inserted into the document. */
    private static Text syntheticText(String value) {
        Text text = new Text();
        text.setValue(value);
        return text;
    }

    /** Returns the value wrapped by a {@link JAXBElement}, or {@code o} itself if it is not wrapped. */
    private static Object unwrap(Object o) {
        return (o instanceof JAXBElement) ? ((JAXBElement<?>) o).getValue() : o;
    }

    /** Demo entry point: counts "NORTH" in {@code myDoc.docx} in the user's home directory. */
    public static void main(String[] args) {
        // Two-argument File constructor inserts the path separator that plain concatenation omitted.
        File docFile = new File(System.getProperty("user.home"), "myDoc.docx");
        FindWordAndReplaceTest th = new FindWordAndReplaceTest("NORTH");
        try {
            System.out.println(th.wordOccurances(docFile));
        } catch (Docx4JException e) {
            e.printStackTrace();
        }
    }
}
I've created this on the basis of the analysis of some documents, but I don't know if I've forgotten some docx4j element. I would like to make this algorithm as general as possible so that it works on every document I will have. Any suggestions?
Also, is there any other smart way to do this, for example with TraversalUtil? Any example?
Thanks