Page 1 of 1

HTML not containing the list styles

PostPosted: Mon May 08, 2017 3:20 pm
by thirub04
Hi Jason,

I am using docx4j to convert docx to HTML.
The output HTML does not retain the list styles, such as bulleted, Roman, or decimal numbering.

Is there any way to fix this?
Or is this the default behavior?

Thanks

Re: HTML not containing the list styles

PostPosted: Tue May 09, 2017 8:47 am
by jason
Your Java code and sample docx exhibiting the issue?

Re: HTML not containing the list styles

PostPosted: Tue May 09, 2017 7:23 pm
by thirub04
Hi Jason,

Attaching my code and docx file.


1134.docx
Document File
(62.06 KiB) Downloaded 251 times




public class DocxToHTMLParser {

/**
 * Command-line entry point: builds a parser instance and runs one conversion pass.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException {
    new DocxToHTMLParser().parseDocxToHTML();
}

/**
 * Converts each {@code .docx} file in the current working directory ({@code user.dir}) to HTML,
 * then rewrites that HTML so block-level elements carry sequential numeric {@code id} attributes.
 *
 * <p>For every matching file the intermediate export is written to {@code output.html} and the
 * id-annotated result to {@code outputWithID.html}, both inside the working directory. Each
 * document overwrites the files produced by the previous one.
 *
 * <p>Conversion and I/O failures are printed to standard error and not rethrown.
 *
 * @return the last {@code outputWithID.html} file written, or {@code null} when no document was
 *         converted (no matching file, unreadable directory, or a failure before the write)
 */
public File parseDocxToHTML() {
    HTMLSettings settings = Docx4J.createHTMLSettings();
    String inputFilePath = System.getProperty("user.dir");
    System.out.println("Input file path is : " + inputFilePath);
    File home = new File(inputFilePath);
    File newHtmlFile = null;

    try {
        // listFiles() yields null when the path is not a readable directory.
        File[] candidates = home.listFiles();
        if (candidates == null) {
            return null;
        }

        for (File file : candidates) {
            String ext = FileUtil.getFileNameExtension(file.getName());
            // Constant-first comparison stays safe if the helper returns null.
            // NOTE(review): assumes the helper returns the extension with its leading dot - confirm.
            if (!".docx".equals(ext)) {
                continue;
            }

            // Load through the File we already hold instead of rebuilding a relative path.
            WordprocessingMLPackage wmlPackage = Docx4J.load(file);
            settings.setImageDirPath(inputFilePath + "_files");
            // getName() gives the last path segment regardless of the platform separator.
            settings.setImageTargetUri(home.getName() + "_files");
            settings.setWmlPackage(wmlPackage);
            // NOTE(review): this reset also zeroes margin/padding on ol, ul and li, which may
            // affect how list markers render - confirm it is wanted.
            String userCss = "html, body, div, span, h1, h2, h3, h4, h5, h6, p, a, img, ol, ul, li, table, caption, tbody, tfoot, thead, tr, th, td "
                    + "{ margin: 0; padding: 0; border: 0;}" + "body {line-height: 1;} ";
            settings.setUserCSS(userCss);

            File exportedHtml = new File(home, "output.html");
            // Close (and thereby flush) the stream before the file is read back below.
            try (OutputStream outputStream = new FileOutputStream(exportedHtml)) {
                Docx4J.toHTML(settings, outputStream, Docx4J.FLAG_EXPORT_PREFER_XSL);
            }

            String input = FileUtils.readFileToString(exportedHtml, "UTF-8");
            String redefined = numberBlockElements(Jsoup.parse(input));

            newHtmlFile = new File(home, "outputWithID.html");
            // Write with the same charset used for reading rather than the platform default.
            FileUtils.writeStringToFile(newHtmlFile, redefined, "UTF-8");
        }
    } catch (Docx4JException | IOException e) {
        e.printStackTrace();
    }
    return newHtmlFile;
}

/**
 * Assigns increasing numeric ids (starting at 1) to the block-level top-level children of the
 * body and to their block-level descendants, and serialises those top-level children.
 *
 * <p>Top-level children whose tag is not in the id-bearing set are neither numbered nor included
 * in the returned markup.
 * NOTE(review): that also drops top-level ol/ul elements - confirm this is intended.
 */
private static String numberBlockElements(Document doc) {
    StringBuilder redefined = new StringBuilder();
    int index = 1;
    for (Element element : doc.body().children()) {
        if (!takesId(element.tagName())) {
            continue;
        }
        element.attr("id", String.valueOf(index++));
        // children().select("*") covers every descendant of the current element.
        for (Element descendant : element.children().select("*")) {
            if (takesId(descendant.tagName())) {
                descendant.attr("id", String.valueOf(index++));
            }
        }
        redefined.append(element.toString());
    }
    return redefined.toString();
}

/** Reports whether an element with the given tag name should receive a generated id. */
private static boolean takesId(String tagName) {
    switch (tagName) {
        case "div":
        case "p":
        case "li":
        case "table":
        case "td":
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
            return true;
        default:
            return false;
    }
}

Re: HTML not containing the list styles

PostPosted: Fri May 12, 2017 9:02 pm
by jason
You can try:

Code: Select all
htmlSettings.getFeatures().remove(ConversionFeatures.PP_HTML_COLLECT_LISTS);