Page 1 of 1

Converting DOCX to HTML, using IntelliJ and Maven

Posted: Wed Apr 14, 2021 6:31 am
by VMSZealot
Cards on the table first of all - I'm not a Java developer, I'm a baffled C developer pretending to know what I'm doing - so even if it seems obvious to you, it won't be to me.

I have successfully used poi to make a DOC to HTML converter - and now I need a DOCX to HTML converter, with a decent degree of accuracy in the formatting. I think that docx4j might be just the tool I'm looking for if I can get it to work.

My Maven POM (pom.xml) looks like this:

Code: Select all
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!-- Build descriptor for the DocumentConverter project: a plain Java build
         that pulls in docx4j, logging and commons-io. -->
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>DocumentConverter</artifactId>
    <version>1.0-SNAPSHOT</version>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <!-- NOTE(review): these explicit values (15) disagree with the
                         maven.compiler.source/target properties (16) declared below;
                         an explicit plugin setting normally overrides the property.
                         Pick one level and declare it in one place only. -->
                    <source>15</source>
                    <target>15</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <properties>
        <!-- NOTE(review): conflicts with the 15 set in the compiler plugin
             configuration above. -->
        <maven.compiler.source>16</maven.compiler.source>
        <maven.compiler.target>16</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- docx4j 8.2.8 packaged with the JAXB reference implementation. -->
        <dependency>
            <groupId>org.docx4j</groupId>
            <artifactId>docx4j-JAXB-ReferenceImpl</artifactId>
            <version>8.2.8</version>
            <!--  <scope>test</scope>-->
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.5</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.8.0</version>
        </dependency>

        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>1.2.3</version>
        </dependency>

        <!--  logging config files; docx4j.properties -->
        <!-- NOTE(review): this docx4j 6.1.2 artifact is a different major version
             from the 8.2.8 docx4j-JAXB-ReferenceImpl declared above. The reply in
             this thread advises removing it together with the MOXY dependency
             below, and the poster confirmed that this resolved the load failure. -->
        <dependency>
            <groupId>org.docx4j</groupId>
            <artifactId>docx4j</artifactId>
            <version>6.1.2</version>
        </dependency>

        <!-- NOTE(review): MOXY is a second JAXB implementation next to the
             reference implementation selected above; the thread's reply advises
             removing it as well. -->
        <dependency>
            <groupId>org.eclipse.persistence</groupId>
            <artifactId>org.eclipse.persistence.moxy</artifactId>
            <version>2.5.1</version>
        </dependency>
       
    </dependencies>

</project>


My code looks like this - as you can see, I've pared it right back to the essentials to see if I can find the problem…

Code: Select all
import org.docx4j.Docx4J;
import org.docx4j.Docx4jProperties;
import org.docx4j.convert.out.HTMLSettings;
import org.docx4j.convert.out.html.AbstractHtmlExporter;
import org.docx4j.convert.out.html.HtmlExporterNG2;
import org.docx4j.model.fields.FieldUpdater;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.org.apache.poi.util.IOUtils;


import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import java.io.*;

public class DocXTest {

    public static void docxToHTML(String filename, String outputname) throws FileNotFoundException, Docx4JException {

        File file = new File(filename);
        FileInputStream inputStream = new FileInputStream(file);
  //      WordprocessingMLPackage pkg = WordprocessingMLPackage.load(inputStream);

        WordprocessingMLPackage wordMLPackage = Docx4J.load(new java.io.File(filename));


//        String root = System.getProperty("user.dir");
//
//        // Refresh the values of DOCPROPERTY fields
//        FieldUpdater updater = new FieldUpdater(pkg);
//        updater.update(true);
//
//        AbstractHtmlExporter exporter = new HtmlExporterNG2();
//        HTMLSettings htmlSettings = Docx4J.createHTMLSettings();
//        htmlSettings.setImageDirPath(root +"/tmp/sample-docx.html_files");
//        htmlSettings.setImageTargetUri(root +"/tmp/_files");
//        htmlSettings.setWmlPackage(pkg);
//
//        Docx4jProperties.setProperty("docx4j.Convert.Out.HTML.OutputMethodXML", true);
//
//        OutputStream os = new FileOutputStream(root + "/" + outputname + ".html");
//        Docx4J.toHTML(htmlSettings, os, Docx4J.FLAG_EXPORT_PREFER_XSL);
//        IOUtils.closeQuietly(os);
//
//        if (pkg.getMainDocumentPart().getFontTablePart() != null) {
//            pkg.getMainDocumentPart().getFontTablePart()
//                    .deleteEmbeddedFontTempFiles();
//        }
//        // This would also do it, via finalize() methods
//        htmlSettings = null;
//        pkg = null;
    }

    public static void main(String[] args) throws Exception {
        String parameter;
        if (args.length < 1) {
            System.out.println("No parameter supplied");
            System.exit(1);
        }
        parameter = args[0];

        docxToHTML(parameter, "testm");
    }
}


Even with so minimal an implementation, with core heavy lifting removed, I get the following errors.

Code: Select all
log4j:WARN No appenders could be found for logger (org.docx4j.utils.ResourceUtils).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Exception in thread "main" org.docx4j.openpackaging.exceptions.Docx4JException: Couldn't get [Content_Types].xml from ZipFile
   at org.docx4j.openpackaging.io3.Load3.get(Load3.java:148)
   at org.docx4j.openpackaging.packages.OpcPackage.load(OpcPackage.java:561)
   at org.docx4j.openpackaging.packages.OpcPackage.load(OpcPackage.java:410)
   at org.docx4j.openpackaging.packages.OpcPackage.load(OpcPackage.java:287)
   at org.docx4j.openpackaging.packages.OpcPackage.load(OpcPackage.java:265)
   at org.docx4j.openpackaging.packages.WordprocessingMLPackage.load(WordprocessingMLPackage.java:168)
   at org.docx4j.Docx4J.load(Docx4J.java:233)
   at DocXTest.docxToHTML(DocXTest.java:26)
   at DocXTest.main(DocXTest.java:88)
Caused by: org.docx4j.openpackaging.exceptions.InvalidFormatException: Bad [Content_Types].xml
   at org.docx4j.openpackaging.contenttype.ContentTypeManager.parseContentTypesFile(ContentTypeManager.java:871)
   at org.docx4j.openpackaging.io3.Load3.get(Load3.java:146)
   ... 8 more
Caused by: java.lang.RuntimeException: javax.xml.bind.JAXBException: JAXB: Can't instantiate JAXB Reference Implementation
- with linked exception:
[java.lang.ClassNotFoundException: org.docx4j.jaxb.ri.NamespacePrefixMapper]
   at org.docx4j.XmlUtils.marshaltoString(XmlUtils.java:901)
   at org.docx4j.openpackaging.contenttype.ContentTypeManager.parseContentTypesFile(ContentTypeManager.java:851)
   ... 9 more
Caused by: javax.xml.bind.JAXBException: JAXB: Can't instantiate JAXB Reference Implementation
- with linked exception:
[java.lang.ClassNotFoundException: org.docx4j.jaxb.ri.NamespacePrefixMapper]
   at org.docx4j.jaxb.NamespacePrefixMapperUtils.tryUsingRI(NamespacePrefixMapperUtils.java:95)
   at org.docx4j.jaxb.NamespacePrefixMapperUtils.getPrefixMapper(NamespacePrefixMapperUtils.java:71)
   at org.docx4j.XmlUtils.marshaltoString(XmlUtils.java:850)
   ... 10 more
Caused by: java.lang.ClassNotFoundException: org.docx4j.jaxb.ri.NamespacePrefixMapper
   at java.base/jdk.internal.loader.BuiltinClassLoader.loadClass(BuiltinClassLoader.java:636)
   at java.base/jdk.internal.loader.ClassLoaders$AppClassLoader.loadClass(ClassLoaders.java:182)
   at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:519)
   at java.base/java.lang.Class.forName0(Native Method)
   at java.base/java.lang.Class.forName(Class.java:375)
   at org.docx4j.jaxb.NamespacePrefixMapperUtils.tryUsingRI(NamespacePrefixMapperUtils.java:79)
   ... 12 more

Process finished with exit code 1


I hope it's obvious, but the web has not been helpful so far. I don't think that I missed anything from https://docx4java.org/docx4j/Docx4j_GettingStarted.pdf

What have I missed?

Re: Converting DOCX to HTML, using IntelliJ and Maven

Posted: Wed Apr 14, 2021 8:53 am
by jason
Which Java version are you using? (Run java -version to check.)

Re: Converting DOCX to HTML, using IntelliJ and Maven

Posted: Wed Apr 14, 2021 6:13 pm
by VMSZealot
I'm using the version included with IntelliJ Community Edition 2020.3 - the About dialog says Runtime 11.0.9.1. I can't find anywhere that this might be overridden in my project.

Re: Converting DOCX to HTML, using IntelliJ and Maven

Posted: Thu Apr 15, 2021 7:18 am
by jason
Remove these from your dependencies:

Code: Select all
        <dependency>
            <groupId>org.docx4j</groupId>
            <artifactId>docx4j</artifactId>
            <version>6.1.2</version>
        </dependency>

        <dependency>
            <groupId>org.eclipse.persistence</groupId>
            <artifactId>org.eclipse.persistence.moxy</artifactId>
            <version>2.5.1</version>
        </dependency>


For further reference, see https://github.com/plutext/docx4j/blob/ ... 4j/pom.xml

Re: Converting DOCX to HTML, using IntelliJ and Maven

Posted: Thu Apr 15, 2021 8:42 am
by VMSZealot
Thank you so much - that worked! Now on to the next problem.