Page 1 of 1

docx to pdf conversion problem

PostPosted: Wed Oct 13, 2010 10:57 pm
by sfeher
Hi,

I would like to convert files from docx format to pdf. I made a pilot project to test it.
The source docx is p1 (I replace things and write sample1.docx with the new content).

This is what I have:

-jdk-1.6 update 21
-docx4j-nightly-20100914.jar

Code: Select all
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package hu.bluesystem.docman;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.xml.bind.JAXBElement;
import org.docx4j.XmlUtils;
import org.docx4j.fonts.IdentityPlusMapper;
import org.docx4j.fonts.Mapper;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.io.SaveToZipFile;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
import org.docx4j.wml.Body;

/**
* Creates a WordprocessingML document from scratch.
*
* @author Jason Harrop
* @version 1.0
*/
public class CreateWordprocessingMLDocument extends AbstractSample {


   
    public static void main(String[] args) throws Exception {
         Calendar c2 = Calendar.getInstance();
         FileOutputStream outxml = new FileOutputStream (new File("c:\\out.xml"));
       


        inputfilepath = "c:\\p1.docx";
        outputfilepath = "c:\\sample1.docx";

        WordprocessingMLPackage wmin = WordprocessingMLPackage.load(new File(inputfilepath));

        MainDocumentPart documentPart = wmin.getMainDocumentPart();
        org.docx4j.wml.Document wmlDocumentEl = (org.docx4j.wml.Document) documentPart.getJaxbElement();


        Body body = wmlDocumentEl.getBody();
        String xml = XmlUtils.marshaltoString(body, true);
        outxml.write(xml.getBytes());
        outxml.close();
        List<Object> bodyChildren = body.getEGBlockLevelElts();
        walkJAXBElements(bodyChildren);

//        Mapper fontMapper = new IdentityPlusMapper();
       //WordprocessingMLPackage.setFontMapper(fontMapper);


        SaveToZipFile saver = new SaveToZipFile(wmin);
        saver.save(outputfilepath);

        org.docx4j.convert.out.pdf.PdfConversion c
                = new org.docx4j.convert.out.pdf.viaXSLFO.Conversion(wmin);
        //                              = new org.docx4j.convert.out.pdf.viaIText.Conversion(wordMLPackage);

        ((org.docx4j.convert.out.pdf.viaXSLFO.Conversion) c).setSaveFO(new java.io.File("c:\\sample.fo"));
        OutputStream os = new java.io.FileOutputStream("c:\\sample.pdf");
        c.output(os);


        Calendar c1 = Calendar.getInstance();
        System.out.println("Time is : " + (c1.getTime().getTime() - c2.getTime().getTime()));
    //System.out.println(xml);
    }


    static void walkJAXBElements(List<Object> bodyChildren) {

        for (Object o : bodyChildren) {
            if (o instanceof org.docx4j.wml.P) {
                walkList(((org.docx4j.wml.P) o).getParagraphContent());
            }
            if (o instanceof javax.xml.bind.JAXBElement) {
                if (((JAXBElement) o).getDeclaredType().toString().equals("class org.docx4j.wml.Tbl")) {
                    org.docx4j.wml.Tbl t = (org.docx4j.wml.Tbl) ((JAXBElement) o).getValue();
                    listtbl(t);
                }
         
            }
        }
    }

        static void listtbl (org.docx4j.wml.Tbl tbl) {
            for (Object o : tbl.getEGContentRowContent()) {

                if (o instanceof org.docx4j.wml.Tr) {

                    org.docx4j.wml.Tr tr = (org.docx4j.wml.Tr) o;

                    for (Object o2 : tr.getEGContentCellContent()) {

                        //System.out.println(" " + o2.getClass().getName());
                        if (o2 instanceof javax.xml.bind.JAXBElement) {

                            if (((JAXBElement) o2).getDeclaredType().getName().equals("org.docx4j.wml.Tc")) {
                                org.docx4j.wml.Tc tc = (org.docx4j.wml.Tc) ((JAXBElement) o2).getValue();
                               
                                // Look at the paragraphs in the tc
                                walkJAXBElements(tc.getEGBlockLevelElts());                               

                            } else {
                                // What is it, if it isn't a Tc?
                                System.out.println("      " + ((JAXBElement) o).getName());
                                System.out.println("      " + ((JAXBElement) o).getDeclaredType().getName());
                            }
                        } else {
                            System.out.println("  " + o.getClass().getName());
                        }
                    }
                } else {
                    System.out.println("  " + o.getClass().getName());
                }

            }
   }

    static void walkList(List children) {
       
         HashMap hm = new HashMap();
         hm.put("[CEGNEVE]",".......");
         hm.put("[CEGCIME]","hggg");
         hm.put("[CEGADOSZAM]","21111");

         Set set = hm.entrySet();
     

        for (Object o : children) {
            if (o instanceof javax.xml.bind.JAXBElement) {
               // System.out.println(((JAXBElement) o).getDeclaredType().getName());
                if (((JAXBElement) o).getDeclaredType().getName().equals("org.docx4j.wml.Text")) {
                    org.docx4j.wml.Text t = (org.docx4j.wml.Text) ((JAXBElement) o).getValue();
                    Iterator i = set.iterator();                   
                    while(i.hasNext()) {
                       Map.Entry me=(Map.Entry)i.next();
                       if (t.getValue().equals(me.getKey())) {
                           t.setValue(me.getValue().toString());
                       }
                    }
                }

            } else if (o instanceof org.docx4j.wml.R) {
                org.docx4j.wml.R run = (org.docx4j.wml.R) o;
                walkList(run.getRunContent());
            }
        }

    }
}




If I run this I get the following:

run:
JAXB: RI not present. Trying Java 6 implementation.
JAXB: Using Java 6 implementation.
log4j:WARN No appenders could be found for logger (org.docx4j.jaxb.Context).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Exception in thread "main" org.docx4j.openpackaging.exceptions.Docx4JException: FOP issues
at org.docx4j.convert.out.pdf.viaXSLFO.Conversion.output(Conversion.java:370)
at hu.bluesystem.docman.CreateWordprocessingMLDocument.main(CreateWordprocessingMLDocument.java:71)
Caused by: java.lang.NullPointerException
at org.docx4j.model.PropertyResolver.init(PropertyResolver.java:193)
at org.docx4j.model.PropertyResolver.<init>(PropertyResolver.java:161)
at org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart.getPropertyResolver(MainDocumentPart.java:177)
at org.docx4j.openpackaging.packages.WordprocessingMLPackage.getDefaultFont(WordprocessingMLPackage.java:368)
at org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart.fontsInUse(MainDocumentPart.java:341)
at org.docx4j.convert.out.pdf.viaXSLFO.Conversion.declareFonts(Conversion.java:145)
at org.docx4j.convert.out.pdf.viaXSLFO.Conversion.output(Conversion.java:250)
... 1 more
Java Result: 1

I would really appreciate it if someone could point out the problem.
Thank you in advance!

Sandor

Re: docx to pdf conversion problem

PostPosted: Wed Oct 13, 2010 11:43 pm
by jason
Hi

It looks like your docx was created in OpenOffice (or something other than Word).

The problem is that it does not contain a default paragraph style.

I've just made it fall back to style0 as the default paragraph style; please try the new nightly http://dev.plutext.org/docx4j/docx4j-ni ... 101013.jar

I notice your code is timing PDF creation. Please bear in mind that JAXB Context setup takes a while, but only needs to be done once (not once per docx). Also, for improved performance, you should look at reusing your FOP configuration (so font setup is just done once).

hope this helps .. Jason

Re: docx to pdf conversion problem

PostPosted: Wed Oct 13, 2010 11:56 pm
by sfeher
jason wrote:Hi

It looks like your docx was created in OpenOffice (or something other than Word).

Yes, in Openoffice.
jason wrote:The problem is that it does not contain a default paragraph style.

I've just made it fall back to style0 as the default paragraph style; please try the new nightly http://dev.plutext.org/docx4j/docx4j-ni ... 101013.jar

Works like a charm!

jason wrote:I notice your code is timing PDF creation. Please bear in mind that JAXB Context setup takes a while, but only needs to be done once (not once per docx). Also, for improved performance, you should look at reusing your FOP configuration (so font setup is just done once).
hope this helps .. Jason

Yes. I check the timing because the converter I work on should be called from command line from another app. But I will pop up a JFrame for calming the end-user :) so timing is important only for me.

Lastly, big respect to you, Jason. This is what I call support!

Re: docx to pdf conversion problem

PostPosted: Thu Oct 14, 2010 1:11 am
by sfeher
So. Hit another problem with the file attached.


Code: Select all
run:
JAXB: RI not present.  Trying Java 6 implementation.
JAXB: Using Java 6 implementation.
log4j:WARN No appenders could be found for logger (org.docx4j.jaxb.Context).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Exception in thread "main" org.docx4j.openpackaging.exceptions.Docx4JException: FOP issues
        at org.docx4j.convert.out.pdf.viaXSLFO.Conversion.output(Conversion.java:355)
        at hu.bluesystem.docman.CreateWordprocessingMLDocument.main(CreateWordprocessingMLDocument.java:71)
Caused by: javax.xml.transform.TransformerException: org.apache.fop.fo.ValidationException: "fo:table-cell" is missing child elements.
Required content model: marker* (%block;)+ (See position 19:332)
        at org.apache.xalan.transformer.TransformerIdentityImpl.transform(TransformerIdentityImpl.java:501)
        at org.docx4j.convert.out.pdf.viaXSLFO.Conversion.output(Conversion.java:347)
        ... 1 more
Caused by: org.apache.fop.fo.ValidationException: "fo:table-cell" is missing child elements.
Required content model: marker* (%block;)+ (See position 19:332)
        at org.apache.fop.events.ValidationExceptionFactory.createException(ValidationExceptionFactory.java:38)
        at org.apache.fop.events.EventExceptionManager.throwException(EventExceptionManager.java:54)
        at org.apache.fop.events.DefaultEventBroadcaster$1.invoke(DefaultEventBroadcaster.java:152)
        at $Proxy37.missingChildElement(Unknown Source)
        at org.apache.fop.fo.FONode.missingChildElementError(FONode.java:564)
        at org.apache.fop.fo.flow.table.TableCell.finalizeNode(TableCell.java:113)
        at org.apache.fop.fo.FONode.endOfNode(FONode.java:329)
        at org.apache.fop.fo.flow.table.TableCell.endOfNode(TableCell.java:105)
        at org.apache.fop.fo.FOTreeBuilder$MainFOHandler.endElement(FOTreeBuilder.java:348)
        at org.apache.fop.fo.FOTreeBuilder.endElement(FOTreeBuilder.java:177)
        at org.apache.xalan.transformer.TransformerIdentityImpl.endElement(TransformerIdentityImpl.java:1101)
        at com.sun.org.apache.xerces.internal.parsers.AbstractSAXParser.endElement(AbstractSAXParser.java:601)
        at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl.scanEndElement(XMLDocumentFragmentScannerImpl.java:1774)
        at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl$FragmentContentDriver.next(XMLDocumentFragmentScannerImpl.java:2930)
        at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl.next(XMLDocumentScannerImpl.java:648)
        at com.sun.org.apache.xerces.internal.impl.XMLNSDocumentScannerImpl.next(XMLNSDocumentScannerImpl.java:140)
        at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl.scanDocument(XMLDocumentFragmentScannerImpl.java:510)
        at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(XML11Configuration.java:807)
        at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(XML11Configuration.java:737)
        at com.sun.org.apache.xerces.internal.parsers.XMLParser.parse(XMLParser.java:107)
        at com.sun.org.apache.xerces.internal.parsers.AbstractSAXParser.parse(AbstractSAXParser.java:1205)
        at org.apache.xalan.transformer.TransformerIdentityImpl.transform(TransformerIdentityImpl.java:484)
        ... 2 more
Java Result: 1


This docx was made in Office 2003.
Thanks a lot!

Re: docx to pdf conversion problem

PostPosted: Thu Oct 14, 2010 6:09 pm
by jason
Hello, your attached docx works for me, through both the code you posted and the standard CreatePdf sample.

All I can tell you right now is that your stack trace relates to the conversion of a table; one of its cells ended up missing content.

Please feel free to post again if you can give me instructions to repro.

Re: docx to pdf conversion problem

PostPosted: Thu Oct 14, 2010 6:58 pm
by sfeher
Hello,

As I mentioned the file was made in Office 2003 with Microsoft Office Compatibility Pack.
I have opened this file in Openoffice.org then saved and ran my project.
It worked fine. So there could be some internal problem related to MS compatibility pack.
I reopened this in MS Office 2000, saved and works fine as well.
I need to fix some problems related to the PDF output: page numbers at the left side, empty lines, setting margins, and line objects reported as not implemented ("support for w:pict - without w:imagedata").
Please advise. Thank you again!

Re: docx to pdf conversion problem

PostPosted: Thu Oct 14, 2010 8:29 pm
by jason
sfeher wrote:I need to fix some problems related to pdf. Page numbers at the left side, empty lines, setting margins, line objects declared as non implemented: support for w:pict - without w:imagedata


Could you post a docx which exhibits the problems?

I'm not really planning to do any significant work myself on the PDF in the near future (unless engaged to do so by a customer), so you may need to look at the source code yourself. Happy to give you tips along the way, especially if you can contribute your changes back.

.. Jason