| 1 | /* |
|---|
| 2 | * Copyright 2007-2008, Plutext Pty Ltd. |
|---|
| 3 | * |
|---|
| 4 | * This file is part of docx4j. |
|---|
| 5 | |
|---|
| 6 | docx4j is licensed under the Apache License, Version 2.0 (the "License"); |
|---|
| 7 | you may not use this file except in compliance with the License. |
|---|
| 8 | |
|---|
| 9 | You may obtain a copy of the License at |
|---|
| 10 | |
|---|
| 11 | http://www.apache.org/licenses/LICENSE-2.0 |
|---|
| 12 | |
|---|
| 13 | Unless required by applicable law or agreed to in writing, software |
|---|
| 14 | distributed under the License is distributed on an "AS IS" BASIS, |
|---|
| 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|---|
| 16 | See the License for the specific language governing permissions and |
|---|
| 17 | limitations under the License. |
|---|
| 18 | |
|---|
| 19 | */ |
|---|
| 20 | |
|---|
| 21 | package org.docx4j.diff; |
|---|
| 22 | |
|---|
| 23 | |
|---|
| 24 | import java.io.File; |
|---|
| 25 | import java.io.FileInputStream; |
|---|
| 26 | import java.io.IOException; |
|---|
| 27 | import java.io.Reader; |
|---|
| 28 | import java.io.StringReader; |
|---|
| 29 | import java.io.StringWriter; |
|---|
| 30 | import java.io.Writer; |
|---|
| 31 | import java.text.SimpleDateFormat; |
|---|
| 32 | import java.util.Calendar; |
|---|
| 33 | import java.util.List; |
|---|
| 34 | import java.util.ArrayList; |
|---|
| 35 | import java.util.Map; |
|---|
| 36 | |
|---|
| 37 | |
|---|
| 38 | import javax.xml.bind.JAXBContext; |
|---|
| 39 | import javax.xml.bind.JAXBElement; |
|---|
| 40 | import javax.xml.bind.Marshaller; |
|---|
| 41 | import javax.xml.bind.Unmarshaller; |
|---|
| 42 | import javax.xml.parsers.DocumentBuilder; |
|---|
| 43 | import javax.xml.parsers.DocumentBuilderFactory; |
|---|
| 44 | import javax.xml.transform.Source; |
|---|
| 45 | import javax.xml.transform.Templates; |
|---|
| 46 | import javax.xml.transform.TransformerConfigurationException; |
|---|
| 47 | import javax.xml.transform.stream.StreamResult; |
|---|
| 48 | import javax.xml.transform.stream.StreamSource; |
|---|
| 49 | |
|---|
| 50 | import javax.xml.stream.*; |
|---|
| 51 | import javax.xml.stream.events.*; |
|---|
| 52 | import javax.xml.stream.XMLOutputFactory; |
|---|
| 53 | import javax.xml.stream.XMLStreamWriter; |
|---|
| 54 | |
|---|
| 55 | import org.apache.log4j.Logger; |
|---|
| 56 | import org.docx4j.XmlUtils; |
|---|
| 57 | import org.docx4j.wml.P; |
|---|
| 58 | import org.docx4j.wml.R; |
|---|
| 59 | |
|---|
| 60 | import org.eclipse.compare.StringComparator; |
|---|
| 61 | import org.eclipse.compare.rangedifferencer.RangeDifference; |
|---|
| 62 | import org.docx4j.jaxb.Context; |
|---|
| 63 | import org.docx4j.openpackaging.parts.relationships.RelationshipsPart; |
|---|
| 64 | import org.docx4j.relationships.Relationship; |
|---|
| 65 | |
|---|
| 66 | import org.eclipse.compare.rangedifferencer.RangeDifferencer; |
|---|
| 67 | import org.w3c.dom.Document; |
|---|
| 68 | import org.w3c.dom.Node; |
|---|
| 69 | import org.xml.sax.InputSource; |
|---|
| 70 | |
|---|
| 71 | import com.topologi.diffx.Docx4jDriver; |
|---|
| 72 | import com.topologi.diffx.Main; |
|---|
| 73 | import com.topologi.diffx.config.DiffXConfig; |
|---|
| 74 | |
|---|
| 75 | |
|---|
| 76 | |
|---|
| 77 | /** |
|---|
| 78 | * Capable of comparing a pair of: |
|---|
| 79 | * - w:body (only lightly tested) |
|---|
| 80 | * - w:sdtContent (used extensively) |
|---|
| 81 | * - w:p (includes an algorithm aimed at producing a better diff) |
|---|
| 82 | * |
|---|
| 83 | * See org.docx4j.samples.CompareDocuments for an example of how to use. |
|---|
| 84 | * |
|---|
| 85 | * @author jason |
|---|
| 86 | * |
|---|
| 87 | */ |
|---|
| 88 | public class Differencer { |
|---|
| 89 | |
|---|
/*
 * TODO:
 *
 * - handle spaces properly (encode real spaces as something before splitting,
 *   and add them back in at the end)
 *
 */
|---|
| 97 | |
|---|
| 98 | protected static Logger log = Logger.getLogger(Differencer.class); |
|---|
| 99 | |
|---|
| 100 | |
|---|
| 101 | // For XSLT |
|---|
| 102 | public static void log(String message ) { |
|---|
| 103 | log.info(message); |
|---|
| 104 | } |
|---|
| 105 | |
|---|
| 106 | |
|---|
| 107 | |
|---|
| 108 | static org.docx4j.wml.ObjectFactory wmlFactory = new org.docx4j.wml.ObjectFactory(); |
|---|
| 109 | |
|---|
| 110 | // The rels used in the resulting diff |
|---|
| 111 | private List<Relationship> composedRels = new ArrayList<Relationship>(); |
|---|
| 112 | public List<Relationship> getComposedRels() { |
|---|
| 113 | return composedRels; |
|---|
| 114 | } |
|---|
| 115 | |
|---|
| 116 | |
|---|
| 117 | |
|---|
| 118 | final private static SimpleDateFormat RFC3339_FORMAT |
|---|
| 119 | = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); |
|---|
| 120 | |
|---|
| 121 | // SimpleDateFormat is not thread-safe see: |
|---|
| 122 | // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6231579 |
|---|
| 123 | // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6178997 |
|---|
| 124 | // solution is to use stateless MessageFormat instead: |
|---|
| 125 | // final private static String RFC3339_FORMAT = "yyyy-MM-dd'T'HH:mm:ss"; |
|---|
| 126 | // final private static String RFC3339_PATTERN = "{0,date," + RFC3339_FORMAT + "}"; |
|---|
| 127 | |
|---|
| 128 | static Templates xsltDiffx2Wml; |
|---|
| 129 | |
|---|
| 130 | /** |
|---|
| 131 | * org/docx4j/diff/diffx2wml.xslt will be used by default |
|---|
| 132 | * to transform the diff output into a Word docx with tracked |
|---|
| 133 | * changes. This method allows you to use your own xslt |
|---|
| 134 | * instead. |
|---|
| 135 | * @param xsltDiffx2Wml |
|---|
| 136 | */ |
|---|
| 137 | public static void setXsltDiffx2Wml(Templates xsltDiffx2Wml) { |
|---|
| 138 | Differencer.xsltDiffx2Wml = xsltDiffx2Wml; |
|---|
| 139 | } |
|---|
| 140 | |
|---|
| 141 | |
|---|
| 142 | |
|---|
| 143 | static Templates xsltMarkupInsert; |
|---|
| 144 | static Templates xsltMarkupDelete; |
|---|
| 145 | |
|---|
// Compile the three bundled stylesheets once, when the class is loaded.
// Loading stays best-effort: a failure is reported through the class
// logger (rather than printed to stderr), and the template that failed,
// plus any that follow it, are left null.
static {
    try {
        Source xsltSource = new StreamSource(org.docx4j.utils.ResourceUtils
                .getResource("org/docx4j/diff/diffx2wml.xslt"));
        xsltDiffx2Wml = XmlUtils.getTransformerTemplate(xsltSource);

        xsltSource = new StreamSource(org.docx4j.utils.ResourceUtils
                .getResource("org/docx4j/diff/MarkupInsert.xslt"));
        xsltMarkupInsert = XmlUtils.getTransformerTemplate(xsltSource);

        xsltSource = new StreamSource(org.docx4j.utils.ResourceUtils
                .getResource("org/docx4j/diff/MarkupDelete.xslt"));
        xsltMarkupDelete = XmlUtils.getTransformerTemplate(xsltSource);
    } catch (IOException e) {
        log.error("Couldn't read a diff stylesheet; diff/markup transforms may be unavailable", e);
    } catch (TransformerConfigurationException e) {
        log.error("Couldn't compile a diff stylesheet; diff/markup transforms may be unavailable", e);
    }
}
|---|
| 166 | |
|---|
| 167 | // /** |
|---|
| 168 | // * @param args |
|---|
| 169 | // */ |
|---|
| 170 | // public static void main(String[] args) throws Exception { |
|---|
| 171 | // |
|---|
| 172 | // String BASE_DIR = "/home/dev/workspace/docx4j/src/test/java/org/docx4j/diff/"; |
|---|
| 173 | // |
|---|
| 174 | // // Test setup |
|---|
| 175 | // String paraL = BASE_DIR + "t2R"; |
|---|
| 176 | // String paraR = BASE_DIR + "t3L"; |
|---|
| 177 | // P pl = loadParagraph(paraL); |
|---|
| 178 | // P pr = loadParagraph(paraR); |
|---|
| 179 | // |
|---|
| 180 | // // Result format |
|---|
| 181 | // StreamResult result = new StreamResult(System.out); |
|---|
| 182 | // |
|---|
| 183 | // // Run the diff - FIXME |
|---|
| 184 | // Differencer pd = new Differencer(); |
|---|
| 185 | // pd.diff(pl, pr, result, null, null, null, null); |
|---|
| 186 | // |
|---|
| 187 | // } |
|---|
| 188 | |
|---|
| 189 | /** |
|---|
| 190 | * The id to be allocated to the ins/del |
|---|
| 191 | * @return |
|---|
| 192 | */ |
|---|
| 193 | public final static Integer getId() { |
|---|
| 194 | return ++nextId; |
|---|
| 195 | } |
|---|
| 196 | public static Integer nextId = 0; |
|---|
| 197 | |
|---|
| 198 | |
|---|
/**
 * Because the resulting document might be built out of the
 * results of a number of diffs, we need to be sure that the ids
 * are unique across these diffs.
 *
 * This identifier is passed into the XSLT, where it is used as part
 * of the generated rel id.
 */
|---|
| 209 | private String relsDiffIdentifier; |
|---|
| 210 | /** |
|---|
| 211 | * @param relsDiffIdentifier the relsDiffIdentifier to set |
|---|
| 212 | */ |
|---|
| 213 | public void setRelsDiffIdentifier(String relsDiffIdentifier) { |
|---|
| 214 | this.relsDiffIdentifier = relsDiffIdentifier; |
|---|
| 215 | } |
|---|
| 216 | |
|---|
| 217 | /** |
|---|
| 218 | * Any rel which is present in the results of the comparison must point to |
|---|
| 219 | * a valid target of the correct type, or the resulting document will |
|---|
| 220 | * be broken. |
|---|
| 221 | * |
|---|
| 222 | * So we pass the old and new rels objects, and |
|---|
| 223 | * progressively build up a List of relationships which will need to be |
|---|
| 224 | * in the resulting document. |
|---|
| 225 | * |
|---|
| 226 | * Because the resulting document might be built out of the |
|---|
| 227 | * results of a number of diffs, we need to be sure that the id's |
|---|
| 228 | * are unique across these diffs. |
|---|
| 229 | * |
|---|
| 230 | * @return the |
|---|
| 231 | */ |
|---|
| 232 | public static void registerRelationship(Differencer pd, |
|---|
| 233 | RelationshipsPart docPartRels, String relId, |
|---|
| 234 | String newRelId ) { |
|---|
| 235 | |
|---|
| 236 | |
|---|
| 237 | if (docPartRels==null) { |
|---|
| 238 | // (In this case, Xalan won't even be able to find this function) |
|---|
| 239 | return; |
|---|
| 240 | } |
|---|
| 241 | |
|---|
| 242 | if (docPartRels.getRelationships()==null) { |
|---|
| 243 | log.warn("relationships object is null!"); |
|---|
| 244 | return; |
|---|
| 245 | } |
|---|
| 246 | |
|---|
| 247 | |
|---|
| 248 | log.error("Looking for rel " + relId); |
|---|
| 249 | Relationship r = docPartRels.getRelationshipByID(relId); |
|---|
| 250 | if (r==null) { |
|---|
| 251 | log.error("Couldn't find rel " + relId); |
|---|
| 252 | return; |
|---|
| 253 | } |
|---|
| 254 | |
|---|
| 255 | Relationship r2 = (Relationship)XmlUtils.deepCopy(r, Context.jcRelationships); |
|---|
| 256 | |
|---|
| 257 | r2.setId(newRelId); |
|---|
| 258 | log.error(".. added rel " + newRelId + " -- " + r2.getTarget() ); |
|---|
| 259 | |
|---|
| 260 | pd.composedRels.add(r2); |
|---|
| 261 | } |
|---|
| 262 | |
|---|
| 263 | /** |
|---|
| 264 | * Compare 2 p objects, returning a result containing |
|---|
| 265 | * w:ins and w:del elements |
|---|
| 266 | * |
|---|
| 267 | * @param pl - the left paragraph |
|---|
| 268 | * @param pr - the right paragraph |
|---|
| 269 | * @param result |
|---|
| 270 | */ |
|---|
| 271 | public void diff(P pl, P pr, javax.xml.transform.Result result, |
|---|
| 272 | String author, java.util.Calendar date, |
|---|
| 273 | RelationshipsPart docPartRelsLeft, RelationshipsPart docPartRelsRight) { |
|---|
| 274 | |
|---|
| 275 | diff(pl, pr, result, |
|---|
| 276 | author, date, |
|---|
| 277 | docPartRelsLeft, docPartRelsRight, |
|---|
| 278 | false); |
|---|
| 279 | } |
|---|
| 280 | |
|---|
| 281 | public void diff(org.docx4j.wml.SdtContentBlock cbNewer, |
|---|
| 282 | org.docx4j.wml.SdtContentBlock cbOlder, |
|---|
| 283 | javax.xml.transform.Result result, |
|---|
| 284 | String author, java.util.Calendar date, |
|---|
| 285 | RelationshipsPart docPartRelsNewer, RelationshipsPart docPartRelsOlder) { |
|---|
| 286 | |
|---|
| 287 | this.diffWorker(org.docx4j.XmlUtils.marshaltoW3CDomDocument(cbNewer).getDocumentElement(), |
|---|
| 288 | org.docx4j.XmlUtils.marshaltoW3CDomDocument(cbOlder).getDocumentElement(), |
|---|
| 289 | result, author, date, docPartRelsNewer, docPartRelsOlder); |
|---|
| 290 | } |
|---|
| 291 | |
|---|
| 292 | public void diff(org.docx4j.wml.Body newer, |
|---|
| 293 | org.docx4j.wml.Body older, |
|---|
| 294 | javax.xml.transform.Result result, |
|---|
| 295 | String author, java.util.Calendar date, |
|---|
| 296 | RelationshipsPart docPartRelsNewer, RelationshipsPart docPartRelsOlder) { |
|---|
| 297 | |
|---|
| 298 | this.diffWorker( |
|---|
| 299 | org.docx4j.XmlUtils.marshaltoW3CDomDocument(newer).getDocumentElement(), |
|---|
| 300 | org.docx4j.XmlUtils.marshaltoW3CDomDocument(older).getDocumentElement(), |
|---|
| 301 | result, author, date, docPartRelsNewer, docPartRelsOlder); |
|---|
| 302 | } |
|---|
| 303 | |
|---|
| 304 | /** |
|---|
| 305 | * This is private, in order to control what objects the user |
|---|
| 306 | * can invoke diff on. At present there are public methods for |
|---|
| 307 | * pairs of w:body, w:sdtContent, and w:p. |
|---|
| 308 | * |
|---|
| 309 | * TODO: consider/test w:table! |
|---|
| 310 | */ |
|---|
| 311 | private void diffWorker(Node newer, |
|---|
| 312 | Node older, |
|---|
| 313 | javax.xml.transform.Result result, |
|---|
| 314 | String author, java.util.Calendar date, |
|---|
| 315 | RelationshipsPart docPartRelsNewer, RelationshipsPart docPartRelsOlder) { |
|---|
| 316 | |
|---|
| 317 | Writer diffxResult = new StringWriter(); |
|---|
| 318 | |
|---|
| 319 | try { |
|---|
| 320 | Docx4jDriver.diff(newer, |
|---|
| 321 | older, |
|---|
| 322 | diffxResult); |
|---|
| 323 | // The signature which takes Reader objects appears to be broken |
|---|
| 324 | diffxResult.close(); |
|---|
| 325 | } catch (Exception exc) { |
|---|
| 326 | exc.printStackTrace(); |
|---|
| 327 | diffxResult = null; |
|---|
| 328 | } |
|---|
| 329 | |
|---|
| 330 | try { |
|---|
| 331 | |
|---|
| 332 | XMLInputFactory inputFactory = XMLInputFactory.newInstance(); |
|---|
| 333 | /* |
|---|
| 334 | * With JDK 1.5, you need to supply a stax jar, or you |
|---|
| 335 | * will get: |
|---|
| 336 | * |
|---|
| 337 | * javax.xml.stream.FactoryConfigurationError: Provider com.bea.xml.stream.MXParserFactory not found |
|---|
| 338 | at javax.xml.stream.FactoryFinder.newInstance(FactoryFinder.java:72) |
|---|
| 339 | at javax.xml.stream.FactoryFinder.find(FactoryFinder.java:176) |
|---|
| 340 | at javax.xml.stream.FactoryFinder.find(FactoryFinder.java:92) |
|---|
| 341 | at javax.xml.stream.XMLInputFactory.newInstance(XMLInputFactory.java:136) |
|---|
| 342 | |
|---|
| 343 | * This is not necessary if you use Java 6. |
|---|
| 344 | * |
|---|
| 345 | * From http://java.sun.com/webservices/docs/1.6/tutorial/doc/SJSXP4.html |
|---|
| 346 | * |
|---|
| 347 | * The XMLInputFactory class lets you configure implementation instances of XML |
|---|
| 348 | * stream reader processors created by the factory. New instances of the abstract |
|---|
| 349 | * class XMLInputFactory are created by calling the newInstance() method on the |
|---|
| 350 | * class. The static method XMLInputFactory.newInstance() is then used to create |
|---|
| 351 | * a new factory instance. |
|---|
| 352 | |
|---|
| 353 | Deriving from JAXP, the XMLInputFactory.newInstance() method determines the |
|---|
| 354 | specific XMLInputFactory implementation class to load by using the following |
|---|
| 355 | lookup procedure: |
|---|
| 356 | |
|---|
| 357 | 1. Use the javax.xml.stream.XMLInputFactory system property. |
|---|
| 358 | |
|---|
| 359 | 2. Use the lib/xml.stream.properties file in the JRE directory. |
|---|
| 360 | |
|---|
| 361 | 3. Use the Services API, if available, to determine the classname |
|---|
| 362 | by looking in the META-INF/services/javax.xml.stream.XMLInputFactory |
|---|
| 363 | files in jars available to the JRE. |
|---|
| 364 | |
|---|
| 365 | 4. Use the platform default XMLInputFactory instance. |
|---|
| 366 | * |
|---|
| 367 | */ |
|---|
| 368 | |
|---|
| 369 | |
|---|
| 370 | //java.io.InputStream is = new java.io.ByteArrayInputStream(naive.getBytes("UTF-8")); |
|---|
| 371 | Reader reader; |
|---|
| 372 | if (log.isDebugEnabled() ) { |
|---|
| 373 | String res = diffxResult.toString(); |
|---|
| 374 | log.debug("Diff result:" + res); |
|---|
| 375 | reader = new StringReader(res); |
|---|
| 376 | } else { |
|---|
| 377 | reader = new StringReader(diffxResult.toString()); |
|---|
| 378 | } |
|---|
| 379 | |
|---|
| 380 | String simplified = null; |
|---|
| 381 | try { |
|---|
| 382 | simplified = combineAdjacent(inputFactory.createXMLStreamReader(reader) ); |
|---|
| 383 | } catch (XMLStreamException e) { |
|---|
| 384 | e.printStackTrace(); |
|---|
| 385 | // log.debug("left: " + XmlUtils.marshaltoString(objectLeft, true, false)); |
|---|
| 386 | // log.debug("right: " + XmlUtils.marshaltoString(objectRight, true, false)); |
|---|
| 387 | } |
|---|
| 388 | |
|---|
| 389 | log.debug("\n\n Diff'd input to transform: \n\n" + simplified ); |
|---|
| 390 | |
|---|
| 391 | StreamSource src = new StreamSource(new StringReader(simplified)); |
|---|
| 392 | transformDiffxOutputToWml(result, author, date, docPartRelsNewer, |
|---|
| 393 | docPartRelsOlder, src); |
|---|
| 394 | |
|---|
| 395 | } catch (Exception exc) { |
|---|
| 396 | exc.printStackTrace(); |
|---|
| 397 | } |
|---|
| 398 | |
|---|
| 399 | } |
|---|
| 400 | |
|---|
| 401 | /** |
|---|
| 402 | * @param result |
|---|
| 403 | * @param author |
|---|
| 404 | * @param date |
|---|
| 405 | * @param docPartRelsLeft |
|---|
| 406 | * @param docPartRelsRight |
|---|
| 407 | * @param src |
|---|
| 408 | * @throws Exception |
|---|
| 409 | */ |
|---|
| 410 | private void transformDiffxOutputToWml(javax.xml.transform.Result result, |
|---|
| 411 | String author, java.util.Calendar date, |
|---|
| 412 | RelationshipsPart docPartRelsLeft, |
|---|
| 413 | RelationshipsPart docPartRelsRight, StreamSource src) |
|---|
| 414 | throws Exception { |
|---|
| 415 | Map<String, Object> transformParameters = new java.util.HashMap<String, Object>(); |
|---|
| 416 | |
|---|
| 417 | String dateString; |
|---|
| 418 | if (date!=null) { |
|---|
| 419 | dateString = RFC3339_FORMAT.format(date.getTime()) ; |
|---|
| 420 | } else { |
|---|
| 421 | // TODO FIXME - JAXB requires a real date. |
|---|
| 422 | // What to give it? |
|---|
| 423 | // The alternative is to change the xslt |
|---|
| 424 | // to omit the @date entirely if its unknown |
|---|
| 425 | dateString = "2009-03-11T17:57:00Z"; |
|---|
| 426 | } |
|---|
| 427 | transformParameters.put("Differencer", this); |
|---|
| 428 | transformParameters.put("date", dateString); |
|---|
| 429 | transformParameters.put("author", author); |
|---|
| 430 | transformParameters.put("docPartRelsLeft", docPartRelsLeft); |
|---|
| 431 | transformParameters.put("docPartRelsRight", docPartRelsRight); |
|---|
| 432 | transformParameters.put("relsDiffIdentifier", relsDiffIdentifier); |
|---|
| 433 | |
|---|
| 434 | XmlUtils.transform(src, xsltDiffx2Wml, transformParameters, result); |
|---|
| 435 | } |
|---|
| 436 | |
|---|
| 437 | public void markupAsInsertion(org.docx4j.wml.SdtContentBlock cbLeft, |
|---|
| 438 | javax.xml.transform.Result result, |
|---|
| 439 | String author, java.util.Calendar date, |
|---|
| 440 | RelationshipsPart docPartRelsLeft) { |
|---|
| 441 | |
|---|
| 442 | Writer diffxResult = new StringWriter(); |
|---|
| 443 | |
|---|
| 444 | try { |
|---|
| 445 | |
|---|
| 446 | // Now marshall it |
|---|
| 447 | JAXBContext jc = Context.jc; |
|---|
| 448 | Marshaller marshaller=jc.createMarshaller(); |
|---|
| 449 | org.w3c.dom.Document doc = org.docx4j.XmlUtils.neww3cDomDocument(); |
|---|
| 450 | |
|---|
| 451 | marshaller.marshal(cbLeft, doc); |
|---|
| 452 | |
|---|
| 453 | |
|---|
| 454 | Map<String, Object> transformParameters = new java.util.HashMap<String, Object>(); |
|---|
| 455 | |
|---|
| 456 | if (date!=null) { |
|---|
| 457 | String dateString = RFC3339_FORMAT.format(date.getTime()) ; |
|---|
| 458 | transformParameters.put("date", dateString); |
|---|
| 459 | } |
|---|
| 460 | |
|---|
| 461 | transformParameters.put("Differencer", this); |
|---|
| 462 | transformParameters.put("author", author); |
|---|
| 463 | transformParameters.put("docPartRelsLeft", docPartRelsLeft); |
|---|
| 464 | transformParameters.put("docPartRelsRight", null); |
|---|
| 465 | transformParameters.put("relsDiffIdentifier", relsDiffIdentifier); |
|---|
| 466 | XmlUtils.transform(doc, xsltMarkupInsert, transformParameters, result); |
|---|
| 467 | |
|---|
| 468 | } catch (Exception exc) { |
|---|
| 469 | exc.printStackTrace(); |
|---|
| 470 | } |
|---|
| 471 | |
|---|
| 472 | } |
|---|
| 473 | |
|---|
| 474 | public void markupAsDeletion(org.docx4j.wml.SdtContentBlock cbLeft, |
|---|
| 475 | javax.xml.transform.Result result, |
|---|
| 476 | String author, java.util.Calendar date, |
|---|
| 477 | RelationshipsPart docPartRelsRight) { |
|---|
| 478 | |
|---|
| 479 | Writer diffxResult = new StringWriter(); |
|---|
| 480 | |
|---|
| 481 | try { |
|---|
| 482 | |
|---|
| 483 | // Now marshall it |
|---|
| 484 | JAXBContext jc = Context.jc; |
|---|
| 485 | Marshaller marshaller=jc.createMarshaller(); |
|---|
| 486 | org.w3c.dom.Document doc = org.docx4j.XmlUtils.neww3cDomDocument(); |
|---|
| 487 | |
|---|
| 488 | marshaller.marshal(cbLeft, doc); |
|---|
| 489 | |
|---|
| 490 | |
|---|
| 491 | Map<String, Object> transformParameters = new java.util.HashMap<String, Object>(); |
|---|
| 492 | |
|---|
| 493 | if (date!=null) { |
|---|
| 494 | String dateString = RFC3339_FORMAT.format(date.getTime()) ; |
|---|
| 495 | transformParameters.put("date", dateString); |
|---|
| 496 | } |
|---|
| 497 | |
|---|
| 498 | transformParameters.put("Differencer", this); |
|---|
| 499 | transformParameters.put("author", author); |
|---|
| 500 | transformParameters.put("docPartRelsLeft", null); |
|---|
| 501 | transformParameters.put("docPartRelsRight", docPartRelsRight); |
|---|
| 502 | transformParameters.put("relsDiffIdentifier", relsDiffIdentifier); |
|---|
| 503 | XmlUtils.transform(doc, xsltMarkupDelete, transformParameters, result); |
|---|
| 504 | |
|---|
| 505 | } catch (Exception exc) { |
|---|
| 506 | exc.printStackTrace(); |
|---|
| 507 | } |
|---|
| 508 | |
|---|
| 509 | } |
|---|
| 510 | |
|---|
| 511 | |
|---|
| 512 | /** |
|---|
| 513 | * Compare 2 p objects, returning a result containing |
|---|
| 514 | * w:ins and w:del elements |
|---|
| 515 | * |
|---|
| 516 | * @param pl - the left paragraph |
|---|
| 517 | * @param pr - the right paragraph |
|---|
| 518 | * @param result |
|---|
| 519 | */ |
|---|
| 520 | public void diff(P pl, P pr, javax.xml.transform.Result result, |
|---|
| 521 | String author, java.util.Calendar date, |
|---|
| 522 | RelationshipsPart docPartRelsLeft, RelationshipsPart docPartRelsRight, |
|---|
| 523 | boolean preProcess) { |
|---|
| 524 | |
|---|
| 525 | |
|---|
| 526 | |
|---|
| 527 | /* In order to get an optimal result when comparing 2 WML paragraphs, |
|---|
| 528 | * it helps if each can be made to contain matching runs. |
|---|
| 529 | * |
|---|
| 530 | * TODO: ensure each w:r contains one and only one w:t |
|---|
| 531 | * |
|---|
| 532 | * The process for achieving this involves running the LCS algorithm |
|---|
| 533 | * on the string content of the paragraph. |
|---|
| 534 | * |
|---|
| 535 | * At this point, you'd actually be done, if you didn't care about |
|---|
| 536 | * run formatting. |
|---|
| 537 | * |
|---|
| 538 | * But we do care about run formatting, so the relevant formatting |
|---|
| 539 | * is then re-attached to each of the sets of runs. |
|---|
| 540 | * |
|---|
| 541 | * The XML diff is then run on these 'normalised' paragraphs. |
|---|
| 542 | * It will tell which of the w:t have been populated/deleted, and |
|---|
| 543 | * what formatting has changed on their w:r elements. |
|---|
| 544 | * |
|---|
| 545 | * In terms of actual performance (versus plain old diffx), the |
|---|
| 546 | * main case where the pre-processing helps: |
|---|
| 547 | * |
|---|
| 548 | * 1. t2R cf t3L |
|---|
| 549 | * |
|---|
| 550 | Left input |
|---|
| 551 | |
|---|
| 552 | <w:p> |
|---|
| 553 | <w:r> |
|---|
| 554 | <w:t xml:space="preserve">The quick brown </w:t> |
|---|
| 555 | </w:r> |
|---|
| 556 | <w:r> |
|---|
| 557 | <w:rPr> |
|---|
| 558 | <w:b/> |
|---|
| 559 | <w:sz w:val="28"/> |
|---|
| 560 | <w:szCs w:val="28"/> |
|---|
| 561 | </w:rPr> |
|---|
| 562 | <w:t>fox</w:t> |
|---|
| 563 | </w:r> |
|---|
| 564 | <w:r> |
|---|
| 565 | <w:t xml:space="preserve"> jumped over the </w:t> |
|---|
| 566 | </w:r> |
|---|
| 567 | <w:r> |
|---|
| 568 | <w:rPr> |
|---|
| 569 | <w:u w:val="single"/> |
|---|
| 570 | </w:rPr> |
|---|
| 571 | <w:t>lazy</w:t> |
|---|
| 572 | </w:r> |
|---|
| 573 | <w:r> |
|---|
| 574 | <w:t xml:space="preserve"> dog.</w:t> |
|---|
| 575 | </w:r> |
|---|
| 576 | </w:p> |
|---|
| 577 | |
|---|
| 578 | |
|---|
| 579 | Right input |
|---|
| 580 | |
|---|
| 581 | <w:p> |
|---|
| 582 | <w:r> |
|---|
| 583 | <w:t>The quick brown fox jumped high </w:t> |
|---|
| 584 | </w:r> |
|---|
| 585 | <w:r> |
|---|
| 586 | <w:t>high over the lazy dog.</w:t> |
|---|
| 587 | </w:r> |
|---|
| 588 | </w:p> |
|---|
| 589 | |
|---|
| 590 | |
|---|
| 591 | * |
|---|
| 592 | */ |
|---|
| 593 | |
|---|
| 594 | String leftXmlOld = null; |
|---|
| 595 | String rightXmlOld = null; |
|---|
| 596 | if (!preProcess || log.isDebugEnabled() ) { |
|---|
| 597 | leftXmlOld = org.docx4j.XmlUtils.marshaltoString(pl, true, false); |
|---|
| 598 | rightXmlOld = org.docx4j.XmlUtils.marshaltoString(pr, true, false); |
|---|
| 599 | // NB boolean prettyprint must be set to false |
|---|
| 600 | // with diffxConfig |
|---|
| 601 | // .setIgnoreWhiteSpace(false); |
|---|
| 602 | // .setPreserveWhiteSpace(true); |
|---|
| 603 | // because otherwise we get ins, del around |
|---|
| 604 | // indentation whitespace, and this |
|---|
| 605 | // breaks the transform to wml. |
|---|
| 606 | |
|---|
| 607 | } |
|---|
| 608 | |
|---|
| 609 | if (!preProcess) { |
|---|
| 610 | |
|---|
| 611 | String naive = getDiffxOutput(leftXmlOld, rightXmlOld); |
|---|
| 612 | |
|---|
| 613 | // Debug purposes only! |
|---|
| 614 | log.debug("\n\n naive difference \n\n" ); |
|---|
| 615 | log.debug(naive) ; |
|---|
| 616 | |
|---|
| 617 | |
|---|
| 618 | log.info("\n\n <p> difference without preprocessing </p> \n\n" ); |
|---|
| 619 | try { |
|---|
| 620 | |
|---|
| 621 | XMLInputFactory inputFactory = XMLInputFactory.newInstance(); |
|---|
| 622 | //java.io.InputStream is = new java.io.ByteArrayInputStream(naive.getBytes("UTF-8")); |
|---|
| 623 | Reader reader = new StringReader(naive); |
|---|
| 624 | String simplified = combineAdjacent(inputFactory.createXMLStreamReader(reader) ); |
|---|
| 625 | |
|---|
| 626 | log.debug("\n\n combineAdjacent: \n\n" + simplified ); |
|---|
| 627 | |
|---|
| 628 | StreamSource src = new StreamSource(new StringReader(simplified)); |
|---|
| 629 | Map<String, Object> transformParameters = new java.util.HashMap<String, Object>(); |
|---|
| 630 | transformParameters.put("Differencer", this); |
|---|
| 631 | transformParameters.put("author", author); |
|---|
| 632 | transformParameters.put("docPartRelsLeft", docPartRelsLeft); |
|---|
| 633 | transformParameters.put("docPartRelsRight", docPartRelsRight); |
|---|
| 634 | transformParameters.put("relsDiffIdentifier", relsDiffIdentifier); |
|---|
| 635 | XmlUtils.transform(src, xsltDiffx2Wml, transformParameters, result); |
|---|
| 636 | |
|---|
| 637 | } catch (Exception exc) { |
|---|
| 638 | exc.printStackTrace(); |
|---|
| 639 | } |
|---|
| 640 | |
|---|
| 641 | return; |
|---|
| 642 | } |
|---|
| 643 | |
|---|
| 644 | |
|---|
| 645 | // Compute LCS |
|---|
| 646 | StringComparator left = new StringComparator(pl.toString()); |
|---|
| 647 | StringComparator right = new StringComparator(pr.toString()); |
|---|
| 648 | org.eclipse.compare.internal.LCSSettings settings = new org.eclipse.compare.internal.LCSSettings(); |
|---|
| 649 | |
|---|
| 650 | RangeDifference[] rd = RangeDifferencer.findRanges(settings, left, right); |
|---|
| 651 | |
|---|
| 652 | // Debug Output |
|---|
| 653 | if (log.isDebugEnabled()) { |
|---|
| 654 | log.debug("\n\n RangeDifferences \n\n"); |
|---|
| 655 | for (int x=0; x<rd.length; x++) { |
|---|
| 656 | log.debug ( |
|---|
| 657 | toRangeString( left, rd[x].leftStart(), rd[x].leftLength(), true ) |
|---|
| 658 | + rd[x].kindString() |
|---|
| 659 | + toRangeString( right, rd[x].rightStart(), rd[x].rightLength(), true ) ); |
|---|
| 660 | } |
|---|
| 661 | } |
|---|
| 662 | |
|---|
| 663 | // Now build appropriate replacement paragraph content |
|---|
| 664 | List<Object> pLeftReplacement = new ArrayList<Object>(); |
|---|
| 665 | List<Object> pRightReplacement = new ArrayList<Object>(); |
|---|
| 666 | |
|---|
| 667 | // Which of the _existing_ w:r we are up to |
|---|
| 668 | int pLeftIndex = 0; |
|---|
| 669 | int pRightIndex = 0; |
|---|
| 670 | |
|---|
| 671 | int[] leftCounts = getParagraphRunTextWordCounts(pl); |
|---|
| 672 | |
|---|
| 673 | // StringBuilder debug = new StringBuilder(); |
|---|
| 674 | // debug.append("{ "); |
|---|
| 675 | // for (int i=0; i < leftCounts.length; i++) { |
|---|
| 676 | // try { |
|---|
| 677 | // debug.append( leftCounts[i] + ", "); |
|---|
| 678 | // } catch (RuntimeException e) { |
|---|
| 679 | // } |
|---|
| 680 | // } |
|---|
| 681 | // System.out.println(debug); |
|---|
| 682 | |
|---|
| 683 | int[] rightCounts = getParagraphRunTextWordCounts(pr); |
|---|
| 684 | |
|---|
| 685 | int leftWordCounter = -1; |
|---|
| 686 | int rightWordCounter = -1; |
|---|
| 687 | |
|---|
| 688 | for (int x=0; x<rd.length; x++) { |
|---|
| 689 | |
|---|
| 690 | // The original runs are always longer than |
|---|
| 691 | // each rd |
|---|
| 692 | |
|---|
| 693 | // We will definitely require a new run |
|---|
| 694 | // structure for each side |
|---|
| 695 | R currentLeftStructure = createRunStructure("", |
|---|
| 696 | pl, pLeftIndex ); |
|---|
| 697 | R currentRightStructure = createRunStructure("", |
|---|
| 698 | pr, pRightIndex ); |
|---|
| 699 | |
|---|
| 700 | pLeftReplacement.add(currentLeftStructure); |
|---|
| 701 | pRightReplacement.add(currentRightStructure); |
|---|
| 702 | |
|---|
| 703 | if (rd[x].kind() == RangeDifference.NOCHANGE) { |
|---|
| 704 | log.debug("NOCHANGE"); |
|---|
| 705 | // These are part of the string LCS, |
|---|
| 706 | // (though they might not be part of the |
|---|
| 707 | // XML LCS once we've added their rPr |
|---|
| 708 | // back in.) |
|---|
| 709 | // This is where we focus our efforts. |
|---|
| 710 | |
|---|
| 711 | |
|---|
| 712 | // Process the words in rd[x] one word at a time |
|---|
| 713 | for (int i=rd[x].leftStart(); // left and right are identical |
|---|
| 714 | i<(rd[x].leftStart()+rd[x].leftLength()); i++) { |
|---|
| 715 | |
|---|
| 716 | // Our objective is to ensure that both the |
|---|
| 717 | // left and right paragraphs end up with |
|---|
| 718 | // matching w:r/w:t boundaries. |
|---|
| 719 | |
|---|
| 720 | // So when either of the existing paragraphs |
|---|
| 721 | // contains a boundary, this need to be inserted |
|---|
| 722 | // in both results |
|---|
| 723 | |
|---|
| 724 | String word = left.getLeaf(i); |
|---|
| 725 | |
|---|
| 726 | leftWordCounter++; |
|---|
| 727 | rightWordCounter++; |
|---|
| 728 | |
|---|
| 729 | // log.debug(word); |
|---|
| 730 | |
|---|
| 731 | if ( leftWordCounter < sum(leftCounts, 0, pLeftIndex) |
|---|
| 732 | && rightWordCounter < sum(rightCounts, 0, pRightIndex) ) { |
|---|
| 733 | |
|---|
| 734 | // it is ok to insert into current w:t |
|---|
| 735 | addWord(currentLeftStructure, word); |
|---|
| 736 | addWord(currentRightStructure, word); |
|---|
| 737 | |
|---|
| 738 | } else { |
|---|
| 739 | |
|---|
| 740 | // log.debug("Hit boundary"); |
|---|
| 741 | |
|---|
| 742 | // which boundary have we hit? |
|---|
| 743 | if (leftWordCounter == sum(leftCounts, 0, pLeftIndex) |
|---|
| 744 | && rightWordCounter == sum(rightCounts, 0, pRightIndex) ) { |
|---|
| 745 | // Quite likely, for example, same formatting in each |
|---|
| 746 | |
|---|
| 747 | // We're now on to each paragraph's next w:t |
|---|
| 748 | pLeftIndex++; |
|---|
| 749 | pRightIndex++; |
|---|
| 750 | |
|---|
| 751 | } else if (leftWordCounter == sum(leftCounts, 0, pLeftIndex) ) { |
|---|
| 752 | |
|---|
| 753 | // We're now on to the left paragraph's next w:t |
|---|
| 754 | pLeftIndex++; |
|---|
| 755 | |
|---|
| 756 | } else { |
|---|
| 757 | |
|---|
| 758 | // We're now on to the right paragraph's next w:t |
|---|
| 759 | pRightIndex++; |
|---|
| 760 | } |
|---|
| 761 | |
|---|
| 762 | currentLeftStructure = createRunStructure(word, |
|---|
| 763 | pl, pLeftIndex ); |
|---|
| 764 | currentRightStructure = createRunStructure(word, |
|---|
| 765 | pr, pRightIndex ); |
|---|
| 766 | |
|---|
| 767 | pLeftReplacement.add(currentLeftStructure); |
|---|
| 768 | pRightReplacement.add(currentRightStructure); |
|---|
| 769 | |
|---|
| 770 | } |
|---|
| 771 | |
|---|
| 772 | } |
|---|
| 773 | |
|---|
| 774 | } else if (rd[x].kind() == RangeDifference.CHANGE) { |
|---|
| 775 | log.debug("CHANGE"); |
|---|
| 776 | // These aren't part of the string LCS, |
|---|
| 777 | // (so they shouldn't be part of |
|---|
| 778 | // the XML LCS) |
|---|
| 779 | |
|---|
| 780 | // All we need to do is make sure that |
|---|
| 781 | // the input is round tripped. |
|---|
| 782 | |
|---|
| 783 | // Left side: Process the words in rd[x] one word at a time |
|---|
| 784 | // NB, can't just copy existing runs into the output |
|---|
| 785 | log.debug(".. left side"); |
|---|
| 786 | for (int i=rd[x].leftStart(); |
|---|
| 787 | i<(rd[x].leftStart()+rd[x].leftLength()); i++) { |
|---|
| 788 | |
|---|
| 789 | String word = left.getLeaf(i); |
|---|
| 790 | // log.debug(word); |
|---|
| 791 | leftWordCounter++; |
|---|
| 792 | |
|---|
| 793 | if ( leftWordCounter < sum(leftCounts, 0, pLeftIndex) ) { |
|---|
| 794 | // it is ok to insert into left's current w:t |
|---|
| 795 | addWord(currentLeftStructure, word); |
|---|
| 796 | } else { |
|---|
| 797 | // boundary hit |
|---|
| 798 | // We're now on to the left paragraph's next w:t |
|---|
| 799 | pLeftIndex++; |
|---|
| 800 | currentLeftStructure = createRunStructure(word, |
|---|
| 801 | pl, pLeftIndex ); |
|---|
| 802 | pLeftReplacement.add(currentLeftStructure); |
|---|
| 803 | } |
|---|
| 804 | |
|---|
| 805 | } |
|---|
| 806 | |
|---|
| 807 | // Right side |
|---|
| 808 | log.debug(".. right side"); |
|---|
| 809 | for (int i=rd[x].rightStart(); |
|---|
| 810 | i<(rd[x].rightStart()+rd[x].rightLength()); i++) { |
|---|
| 811 | |
|---|
| 812 | String word = right.getLeaf(i); |
|---|
| 813 | log.debug(word); |
|---|
| 814 | rightWordCounter++; |
|---|
| 815 | |
|---|
| 816 | if ( rightWordCounter < sum(rightCounts, 0, pRightIndex) ) { |
|---|
| 817 | // it is ok to insert into right's current w:t |
|---|
| 818 | addWord(currentRightStructure, word); |
|---|
| 819 | } else { |
|---|
| 820 | // boundary hit |
|---|
| 821 | // We're now on to the right paragraph's next w:t |
|---|
| 822 | pRightIndex++; |
|---|
| 823 | currentRightStructure = createRunStructure(word, |
|---|
| 824 | pr, pRightIndex ); |
|---|
| 825 | pRightReplacement.add(currentRightStructure); |
|---|
| 826 | } |
|---|
| 827 | } |
|---|
| 828 | |
|---|
| 829 | } |
|---|
| 830 | |
|---|
| 831 | } |
|---|
| 832 | |
|---|
| 833 | |
|---|
| 834 | org.docx4j.wml.P newLeftP = wmlFactory.createP(); |
|---|
| 835 | newLeftP.setPPr(pl.getPPr()); |
|---|
| 836 | newLeftP.getParagraphContent().addAll(pLeftReplacement); |
|---|
| 837 | |
|---|
| 838 | org.docx4j.wml.P newRightP = wmlFactory.createP(); |
|---|
| 839 | newRightP.setPPr(pr.getPPr()); |
|---|
| 840 | newRightP.getParagraphContent().addAll(pRightReplacement); |
|---|
| 841 | |
|---|
| 842 | log.debug("\n\n Left input \n\n" ); |
|---|
| 843 | log.debug(leftXmlOld) ; |
|---|
| 844 | |
|---|
| 845 | log.debug("\n\n New left side \n\n" ); |
|---|
| 846 | String leftXmlNew = org.docx4j.XmlUtils.marshaltoString(newLeftP, true, false); |
|---|
| 847 | log.debug(leftXmlNew) ; |
|---|
| 848 | |
|---|
| 849 | log.debug("\n\n Right input \n\n" ); |
|---|
| 850 | log.debug(rightXmlOld) ; |
|---|
| 851 | |
|---|
| 852 | log.debug("\n\n New right side \n\n" ); |
|---|
| 853 | String rightXmlNew = org.docx4j.XmlUtils.marshaltoString(newRightP, true, false); |
|---|
| 854 | log.debug(rightXmlNew) ; |
|---|
| 855 | |
|---|
| 856 | log.debug("\n\n Difference \n\n" ); |
|---|
| 857 | |
|---|
| 858 | String diffx = getDiffxOutput(leftXmlNew, rightXmlNew); |
|---|
| 859 | //String diffx = getDiffxOutput(rightXmlNew, leftXmlNew); |
|---|
| 860 | log.debug(diffx) ; |
|---|
| 861 | |
|---|
| 862 | log.info("\n\n <p> difference with pre-processing</p> \n\n" ); |
|---|
| 863 | try { |
|---|
| 864 | StreamSource src = new StreamSource(new StringReader(diffx)); |
|---|
| 865 | transformDiffxOutputToWml(result, author, date, docPartRelsLeft, |
|---|
| 866 | docPartRelsRight, src); |
|---|
| 867 | } catch (Exception exc) { |
|---|
| 868 | exc.printStackTrace(); |
|---|
| 869 | } |
|---|
| 870 | |
|---|
| 871 | |
|---|
| 872 | log.debug("\n\n Done!" ); |
|---|
| 873 | |
|---|
| 874 | } |
|---|
| 875 | |
|---|
| 876 | private static int sum( int[] array, int idx1, int idx2) { |
|---|
| 877 | |
|---|
| 878 | StringBuilder debug = new StringBuilder(); |
|---|
| 879 | |
|---|
| 880 | debug.append("{ "); |
|---|
| 881 | |
|---|
| 882 | int sum = 0; |
|---|
| 883 | |
|---|
| 884 | for (int i=idx1; i <= idx2; i++) { |
|---|
| 885 | debug.append( array[i] + ", "); |
|---|
| 886 | |
|---|
| 887 | sum+=array[i]; |
|---|
| 888 | } |
|---|
| 889 | debug.append("} = " + sum); |
|---|
| 890 | // System.out.println(debug); |
|---|
| 891 | return sum; |
|---|
| 892 | |
|---|
| 893 | } |
|---|
| 894 | |
|---|
| 895 | /** Add a word to a w:r's existing w:t */ |
|---|
| 896 | private static void addWord(R r, String word) { |
|---|
| 897 | |
|---|
| 898 | List runContent = r.getRunContent(); |
|---|
| 899 | |
|---|
| 900 | for (Object o2 : runContent ) { |
|---|
| 901 | |
|---|
| 902 | /* TODO - model assumes each w:r contains |
|---|
| 903 | only 1 w:t |
|---|
| 904 | |
|---|
| 905 | Check spec to see what the story is. |
|---|
| 906 | |
|---|
| 907 | */ |
|---|
| 908 | |
|---|
| 909 | boolean found = false; |
|---|
| 910 | |
|---|
| 911 | if (o2 instanceof org.docx4j.wml.Text) { |
|---|
| 912 | |
|---|
| 913 | if (found) { |
|---|
| 914 | log.debug("TODO: Handle multiple w:t in w:r!"); |
|---|
| 915 | } |
|---|
| 916 | |
|---|
| 917 | found = true; |
|---|
| 918 | |
|---|
| 919 | org.docx4j.wml.Text t = (org.docx4j.wml.Text)o2; |
|---|
| 920 | |
|---|
| 921 | String existingVal = t.getValue(); |
|---|
| 922 | |
|---|
| 923 | t.setValue(existingVal + " " + word); // TODO smarter handling of spaces |
|---|
| 924 | |
|---|
| 925 | } else { |
|---|
| 926 | log.debug(o2.getClass().getName()); |
|---|
| 927 | } |
|---|
| 928 | } |
|---|
| 929 | |
|---|
| 930 | |
|---|
| 931 | |
|---|
| 932 | } |
|---|
| 933 | |
|---|
| 934 | |
|---|
| 935 | private static org.docx4j.wml.R createRunStructure(String textVal, |
|---|
| 936 | P existingP, int rIndex ) { |
|---|
| 937 | |
|---|
| 938 | org.docx4j.wml.R newR = wmlFactory.createR(); |
|---|
| 939 | org.docx4j.wml.Text newT = wmlFactory.createText(); |
|---|
| 940 | newR.getRunContent().add(newT); |
|---|
| 941 | newT.setValue(textVal); |
|---|
| 942 | newT.setSpace("preserve"); |
|---|
| 943 | org.docx4j.wml.RPr existingRPr = ((org.docx4j.wml.R)existingP.getParagraphContent().get(rIndex)).getRPr(); |
|---|
| 944 | if ( existingRPr !=null ) |
|---|
| 945 | newR.setRPr(existingRPr); |
|---|
| 946 | return newR; |
|---|
| 947 | |
|---|
| 948 | } |
|---|
| 949 | |
|---|
| 950 | private static String toRangeString(StringComparator sc, int start, int length, boolean space) { |
|---|
| 951 | |
|---|
| 952 | // This method only exists for debug... |
|---|
| 953 | |
|---|
| 954 | StringBuilder result = new StringBuilder(); |
|---|
| 955 | for (int x=start; x<(start+length); x++) { |
|---|
| 956 | if (space) { |
|---|
| 957 | result.append(sc.getLeaf(x) + " "); |
|---|
| 958 | } else { |
|---|
| 959 | result.append(sc.getLeaf(x)); |
|---|
| 960 | } |
|---|
| 961 | } |
|---|
| 962 | return result.toString(); |
|---|
| 963 | } |
|---|
| 964 | |
|---|
| 965 | protected static org.docx4j.wml.P loadParagraph(String filename) throws Exception { |
|---|
| 966 | |
|---|
| 967 | java.io.File f = new java.io.File(filename); |
|---|
| 968 | java.io.InputStream is = new java.io.FileInputStream(f); |
|---|
| 969 | JAXBContext jc = org.docx4j.jaxb.Context.jc; |
|---|
| 970 | |
|---|
| 971 | Unmarshaller u = jc.createUnmarshaller(); |
|---|
| 972 | |
|---|
| 973 | //u.setSchema(org.docx4j.jaxb.WmlSchema.schema); |
|---|
| 974 | u.setEventHandler(new org.docx4j.jaxb.JaxbValidationEventHandler()); |
|---|
| 975 | |
|---|
| 976 | return (org.docx4j.wml.P)u.unmarshal( is ); |
|---|
| 977 | |
|---|
| 978 | |
|---|
| 979 | |
|---|
| 980 | } |
|---|
| 981 | |
|---|
| 982 | public static int[] getParagraphRunTextWordCounts(P p) { |
|---|
| 983 | |
|---|
| 984 | List<Object> children = p.getParagraphContent(); |
|---|
| 985 | |
|---|
| 986 | int i=0; |
|---|
| 987 | int[] result = new int[children.size()]; // one for each w:r |
|---|
| 988 | |
|---|
| 989 | for (Object o : children) { |
|---|
| 990 | |
|---|
| 991 | if ( o instanceof org.docx4j.wml.R ) { |
|---|
| 992 | |
|---|
| 993 | org.docx4j.wml.R r = (org.docx4j.wml.R)o; |
|---|
| 994 | List runContent = r.getRunContent(); |
|---|
| 995 | |
|---|
| 996 | result[i]=0; |
|---|
| 997 | |
|---|
| 998 | for (Object o2 : runContent ) { |
|---|
| 999 | |
|---|
| 1000 | /* TODO - model assumes each w:r contains |
|---|
| 1001 | only 1 w:t |
|---|
| 1002 | |
|---|
| 1003 | Check spec to see what the story is. |
|---|
| 1004 | |
|---|
| 1005 | */ |
|---|
| 1006 | |
|---|
| 1007 | boolean found = false; |
|---|
| 1008 | |
|---|
| 1009 | if (o2 instanceof javax.xml.bind.JAXBElement) { |
|---|
| 1010 | |
|---|
| 1011 | if (((JAXBElement) o2).getDeclaredType().getName().equals( |
|---|
| 1012 | "org.docx4j.wml.Text")) { |
|---|
| 1013 | |
|---|
| 1014 | if (found) { |
|---|
| 1015 | log.debug("TODO: Handle multiple w:t in w:r!"); |
|---|
| 1016 | } |
|---|
| 1017 | |
|---|
| 1018 | found = true; |
|---|
| 1019 | |
|---|
| 1020 | // System.out.println("Found Text"); |
|---|
| 1021 | org.docx4j.wml.Text t = (org.docx4j.wml.Text) ((JAXBElement) o2) |
|---|
| 1022 | .getValue(); |
|---|
| 1023 | |
|---|
| 1024 | result[i] = getWordCount( t.getValue() ); |
|---|
| 1025 | |
|---|
| 1026 | } else { |
|---|
| 1027 | log.debug(((JAXBElement) o2).getDeclaredType().getName()); |
|---|
| 1028 | } |
|---|
| 1029 | } else { |
|---|
| 1030 | log.debug(o2.getClass().getName()); |
|---|
| 1031 | } |
|---|
| 1032 | } |
|---|
| 1033 | |
|---|
| 1034 | i++; |
|---|
| 1035 | |
|---|
| 1036 | } else { |
|---|
| 1037 | log.debug("Encountered " + children.get(i).getClass().getName()); |
|---|
| 1038 | return null; |
|---|
| 1039 | |
|---|
| 1040 | } |
|---|
| 1041 | } |
|---|
| 1042 | |
|---|
| 1043 | return result; |
|---|
| 1044 | |
|---|
| 1045 | } |
|---|
| 1046 | |
|---|
| 1047 | |
|---|
| 1048 | private static int getWordCount(String sentence) { |
|---|
| 1049 | |
|---|
| 1050 | /* |
|---|
| 1051 | * Need to convert leading and trailing spaces |
|---|
| 1052 | * in order to get correct count. |
|---|
| 1053 | * |
|---|
| 1054 | * 'a' 1 |
|---|
| 1055 | ' a' 2 |
|---|
| 1056 | 'a ' 1 |
|---|
| 1057 | ' b ' 2 |
|---|
| 1058 | ' b c ' 3 |
|---|
| 1059 | 'b c' 2 |
|---|
| 1060 | 'b c' 3 <-- and also double spaces here |
|---|
| 1061 | |
|---|
| 1062 | * |
|---|
| 1063 | * trim takes care of leading and trailing. |
|---|
| 1064 | */ |
|---|
| 1065 | |
|---|
| 1066 | return sentence.trim().split("\\s").length; |
|---|
| 1067 | |
|---|
| 1068 | // TODO - handle cases of 2 spaces in a row, within the sentence |
|---|
| 1069 | // via an improved regex |
|---|
| 1070 | |
|---|
| 1071 | } |
|---|
| 1072 | |
|---|
| 1073 | |
|---|
| 1074 | public static String getRunString(org.docx4j.wml.P p, int i) { |
|---|
| 1075 | |
|---|
| 1076 | StringBuilder result = new StringBuilder(); |
|---|
| 1077 | |
|---|
| 1078 | List<Object> children = p.getParagraphContent(); |
|---|
| 1079 | |
|---|
| 1080 | if ( children.get(i) instanceof org.docx4j.wml.R ) { |
|---|
| 1081 | |
|---|
| 1082 | org.docx4j.wml.R r = (org.docx4j.wml.R)children.get(i); |
|---|
| 1083 | List runContent = r.getRunContent(); |
|---|
| 1084 | |
|---|
| 1085 | for (Object o2 : runContent ) { |
|---|
| 1086 | if (o2 instanceof javax.xml.bind.JAXBElement) { |
|---|
| 1087 | |
|---|
| 1088 | if (((JAXBElement) o2).getDeclaredType().getName().equals( |
|---|
| 1089 | "org.docx4j.wml.Text")) { |
|---|
| 1090 | // log.debug("Found Text"); |
|---|
| 1091 | org.docx4j.wml.Text t = (org.docx4j.wml.Text) ((JAXBElement) o2) |
|---|
| 1092 | .getValue(); |
|---|
| 1093 | result.append(t.getValue()); |
|---|
| 1094 | } else { |
|---|
| 1095 | log.debug(((JAXBElement) o2).getDeclaredType().getName()); |
|---|
| 1096 | } |
|---|
| 1097 | } else { |
|---|
| 1098 | log.debug(o2.getClass().getName()); |
|---|
| 1099 | } |
|---|
| 1100 | } |
|---|
| 1101 | |
|---|
| 1102 | } else { |
|---|
| 1103 | log.debug("Encountered " + children.get(i).getClass().getName()); |
|---|
| 1104 | return null; |
|---|
| 1105 | |
|---|
| 1106 | } |
|---|
| 1107 | |
|---|
| 1108 | return result.toString(); |
|---|
| 1109 | |
|---|
| 1110 | } |
|---|
| 1111 | |
|---|
| 1112 | |
|---|
| 1113 | private static String getDiffxOutput(String xml1, String xml2) { |
|---|
| 1114 | Reader xmlr1 = new StringReader(xml1); |
|---|
| 1115 | Reader xmlr2 = new StringReader(xml2); |
|---|
| 1116 | |
|---|
| 1117 | // output |
|---|
| 1118 | Writer out = new StringWriter(); |
|---|
| 1119 | |
|---|
| 1120 | DiffXConfig diffxConfig = new DiffXConfig(); |
|---|
| 1121 | diffxConfig.setIgnoreWhiteSpace(false); |
|---|
| 1122 | diffxConfig.setPreserveWhiteSpace(true); |
|---|
| 1123 | |
|---|
| 1124 | try { |
|---|
| 1125 | Main.diff(toNode(xmlr1, true), toNode(xmlr2, true), out, diffxConfig); |
|---|
| 1126 | // The signature which takes Reader objects appears to be broken |
|---|
| 1127 | out.close(); |
|---|
| 1128 | } catch (Exception exc) { |
|---|
| 1129 | exc.printStackTrace(); |
|---|
| 1130 | out = null; |
|---|
| 1131 | } |
|---|
| 1132 | |
|---|
| 1133 | return (out == null) ? null : out.toString(); |
|---|
| 1134 | } |
|---|
| 1135 | |
|---|
| 1136 | /** |
|---|
| 1137 | * Converts the reader to a node. |
|---|
| 1138 | * |
|---|
| 1139 | * @param xml |
|---|
| 1140 | * The reader on the XML. |
|---|
| 1141 | * @param isNSAware |
|---|
| 1142 | * Whether the factory should be namespace aware. |
|---|
| 1143 | * |
|---|
| 1144 | * @return The corresponding node. |
|---|
| 1145 | */ |
|---|
| 1146 | private static Node toNode(Reader xml, boolean isNSAware) { |
|---|
| 1147 | DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); |
|---|
| 1148 | factory.setNamespaceAware(isNSAware); |
|---|
| 1149 | try { |
|---|
| 1150 | DocumentBuilder builder = factory.newDocumentBuilder(); |
|---|
| 1151 | Document document = builder.parse(new InputSource(xml)); |
|---|
| 1152 | return document; |
|---|
| 1153 | } catch (Exception ex) { |
|---|
| 1154 | ex.printStackTrace(); |
|---|
| 1155 | } |
|---|
| 1156 | return null; |
|---|
| 1157 | } |
|---|
| 1158 | |
|---|
| 1159 | /* diffx treats each word as a token, and its output may |
|---|
| 1160 | * look like: |
|---|
| 1161 | * |
|---|
| 1162 | * <ins>Well,</ins><ins> </ins><ins>maybe</ins> |
|---|
| 1163 | * |
|---|
| 1164 | * This method will change that to: |
|---|
| 1165 | * |
|---|
| 1166 | * <ins>Well, maybe</ins> |
|---|
| 1167 | */ |
|---|
| 1168 | private static String combineAdjacent(XMLStreamReader reader) throws XMLStreamException |
|---|
| 1169 | { |
|---|
| 1170 | /* A more complex example: |
|---|
| 1171 | * |
|---|
| 1172 | <w:r><w:t>It<ins> </ins><ins>is</ins><ins> </ins><ins>good</ins> <ins>indeed</ins> |
|---|
| 1173 | <del>would</del> <ins>very</ins><del>be</del> good to read paragraph |
|---|
| 1174 | spacing<ins> </ins><ins>properly</ins><ins> </ins><ins>I</ins><ins> </ins><ins>would</ins> <ins>say.</ins><del>property.</del></w:t></w:r> |
|---|
| 1175 | |
|---|
| 1176 | becomes |
|---|
| 1177 | |
|---|
| 1178 | <w:r><w:t>It<ins> is good</ins> <ins>indeed</ins> |
|---|
| 1179 | <del>would</del> <ins>very</ins><del>be</del> good to read paragraph |
|---|
| 1180 | spacing<ins> properly I would</ins> <ins>say.</ins><del>property.</del></w:t></w:r> |
|---|
| 1181 | * |
|---|
| 1182 | * |
|---|
| 1183 | * |
|---|
| 1184 | */ |
|---|
| 1185 | |
|---|
| 1186 | String memory = null; |
|---|
| 1187 | |
|---|
| 1188 | |
|---|
| 1189 | // XmlWriterSettings settings = new XmlWriterSettings(); |
|---|
| 1190 | // settings.OmitXmlDeclaration = true; // important! |
|---|
| 1191 | // settings.Encoding = Encoding.UTF8; |
|---|
| 1192 | java.io.StringWriter stringWriter = new java.io.StringWriter(); |
|---|
| 1193 | |
|---|
| 1194 | XMLOutputFactory factory = XMLOutputFactory.newInstance(); |
|---|
| 1195 | XMLStreamWriter writer = factory.createXMLStreamWriter(stringWriter); |
|---|
| 1196 | |
|---|
| 1197 | while ( reader.hasNext() ) { |
|---|
| 1198 | |
|---|
| 1199 | int event = reader.next(); |
|---|
| 1200 | |
|---|
| 1201 | switch (event) { |
|---|
| 1202 | |
|---|
| 1203 | case XMLStreamConstants.END_ELEMENT: |
|---|
| 1204 | |
|---|
| 1205 | if (reader.getLocalName().equals("ins")) |
|---|
| 1206 | { |
|---|
| 1207 | memory = "ins"; |
|---|
| 1208 | // and don't write it |
|---|
| 1209 | // until we see what the next |
|---|
| 1210 | // element is |
|---|
| 1211 | } |
|---|
| 1212 | else if (reader.getLocalName().equals("del")) |
|---|
| 1213 | { |
|---|
| 1214 | memory = "del"; |
|---|
| 1215 | } |
|---|
| 1216 | else |
|---|
| 1217 | { |
|---|
| 1218 | writer.writeEndElement(); |
|---|
| 1219 | } |
|---|
| 1220 | break; |
|---|
| 1221 | |
|---|
| 1222 | |
|---|
| 1223 | case XMLStreamConstants.START_ELEMENT: |
|---|
| 1224 | |
|---|
| 1225 | try { |
|---|
| 1226 | if (memory != null) |
|---|
| 1227 | { |
|---|
| 1228 | // There is an </ins> (or </del>) we have just ignored |
|---|
| 1229 | |
|---|
| 1230 | if (memory.equals(reader.getLocalName())) |
|---|
| 1231 | { |
|---|
| 1232 | // Hit </ins><ins> |
|---|
| 1233 | // This is the case where |
|---|
| 1234 | // we don't write either of them ... |
|---|
| 1235 | memory = null; |
|---|
| 1236 | continue; |
|---|
| 1237 | } |
|---|
| 1238 | else |
|---|
| 1239 | { |
|---|
| 1240 | // This is a different node, |
|---|
| 1241 | // so write the </ins> |
|---|
| 1242 | writer.writeEndElement(); |
|---|
| 1243 | memory = null; |
|---|
| 1244 | } |
|---|
| 1245 | } |
|---|
| 1246 | if (reader.getNamespaceURI() == null ) { |
|---|
| 1247 | writer.writeStartElement(reader.getLocalName()); |
|---|
| 1248 | |
|---|
| 1249 | } else { |
|---|
| 1250 | writer.writeStartElement(reader.getPrefix(), reader.getLocalName(), reader.getNamespaceURI()); |
|---|
| 1251 | } |
|---|
| 1252 | for (int i=0; i<reader.getAttributeCount() ; i++ ) { |
|---|
| 1253 | |
|---|
| 1254 | if (reader.getAttributeNamespace(i)==null) { |
|---|
| 1255 | //log.debug("Writing " + reader.getLocalName() + "/@" + reader.getAttributeLocalName(i) ); |
|---|
| 1256 | writer.writeAttribute( |
|---|
| 1257 | reader.getAttributeLocalName(i), |
|---|
| 1258 | reader.getAttributeValue(i) ); |
|---|
| 1259 | } else { |
|---|
| 1260 | writer.writeAttribute( |
|---|
| 1261 | reader.getAttributePrefix(i), |
|---|
| 1262 | reader.getAttributeNamespace(i), |
|---|
| 1263 | reader.getAttributeLocalName(i), |
|---|
| 1264 | reader.getAttributeValue(i)); |
|---|
| 1265 | } |
|---|
| 1266 | } |
|---|
| 1267 | for (int i=0; i<reader.getNamespaceCount() ; i++ ) { |
|---|
| 1268 | writer.writeNamespace( |
|---|
| 1269 | reader.getNamespacePrefix(i), |
|---|
| 1270 | reader.getNamespaceURI(i) ); |
|---|
| 1271 | } |
|---|
| 1272 | } catch (XMLStreamException e) { |
|---|
| 1273 | log.error("Issue at element: " + reader.getLocalName() + "\n", e); |
|---|
| 1274 | throw e; |
|---|
| 1275 | } |
|---|
| 1276 | |
|---|
| 1277 | break; |
|---|
| 1278 | |
|---|
| 1279 | case XMLStreamConstants.CHARACTERS: |
|---|
| 1280 | if (memory != null) |
|---|
| 1281 | { |
|---|
| 1282 | // eg "</ins>HERE" |
|---|
| 1283 | writer.writeEndElement(); |
|---|
| 1284 | memory = null; |
|---|
| 1285 | } |
|---|
| 1286 | writer.writeCharacters(reader.getText()); |
|---|
| 1287 | break; |
|---|
| 1288 | |
|---|
| 1289 | |
|---|
| 1290 | case XMLStreamConstants.START_DOCUMENT: |
|---|
| 1291 | |
|---|
| 1292 | writer.writeStartDocument(); |
|---|
| 1293 | break; |
|---|
| 1294 | |
|---|
| 1295 | case XMLStreamConstants.END_DOCUMENT: |
|---|
| 1296 | |
|---|
| 1297 | writer.writeEndDocument(); |
|---|
| 1298 | break; |
|---|
| 1299 | |
|---|
| 1300 | default: |
|---|
| 1301 | |
|---|
| 1302 | // Ignore |
|---|
| 1303 | } |
|---|
| 1304 | |
|---|
| 1305 | } |
|---|
| 1306 | |
|---|
| 1307 | writer.flush(); |
|---|
| 1308 | writer.close(); |
|---|
| 1309 | |
|---|
| 1310 | return stringWriter.toString(); |
|---|
| 1311 | } |
|---|
| 1312 | |
|---|
| 1313 | /*String[] runtContents = "a".trim().split("\\s"); |
|---|
| 1314 | System.out.println( "'a' " + runtContents.length ); |
|---|
| 1315 | |
|---|
| 1316 | runtContents = " a".trim().split("\\s"); |
|---|
| 1317 | System.out.println( "' a' " + runtContents.length ); |
|---|
| 1318 | |
|---|
| 1319 | runtContents = "a ".trim().split("\\s"); |
|---|
| 1320 | System.out.println( "'a ' " + runtContents.length ); |
|---|
| 1321 | |
|---|
| 1322 | runtContents = " b ".trim().split("\\s"); |
|---|
| 1323 | System.out.println( "' b ' " + runtContents.length ); |
|---|
| 1324 | |
|---|
| 1325 | runtContents = " b c ".trim().split("\\s"); |
|---|
| 1326 | System.out.println( "' b c ' " + runtContents.length ); |
|---|
| 1327 | |
|---|
| 1328 | runtContents = "b c".trim().split("\\s"); |
|---|
| 1329 | System.out.println( "'b c' " + runtContents.length ); |
|---|
| 1330 | |
|---|
| 1331 | runtContents = "b c".trim().split("\\s"); |
|---|
| 1332 | System.out.println( "'b c' " + runtContents.length );*/ |
|---|
| 1333 | |
|---|
| 1334 | public static void main(String[] args) throws Exception { |
|---|
| 1335 | |
|---|
| 1336 | // Result format |
|---|
| 1337 | Writer diffxResult = new StringWriter(); |
|---|
| 1338 | |
|---|
| 1339 | // Run the diff |
|---|
| 1340 | try { |
|---|
| 1341 | |
|---|
| 1342 | XMLInputFactory inputFactory = XMLInputFactory.newInstance(); |
|---|
| 1343 | //java.io.InputStream is = new java.io.ByteArrayInputStream(naive.getBytes("UTF-8")); |
|---|
| 1344 | String simplified = combineAdjacent( |
|---|
| 1345 | inputFactory.createXMLStreamReader(new FileInputStream(new File("tmp_adj.xml"))) ); |
|---|
| 1346 | |
|---|
| 1347 | System.out.println("Done"); |
|---|
| 1348 | } catch (Exception exc) { |
|---|
| 1349 | exc.printStackTrace(); |
|---|
| 1350 | diffxResult = null; |
|---|
| 1351 | } |
|---|
| 1352 | } |
|---|
| 1353 | |
|---|
| 1354 | |
|---|
| 1355 | |
|---|
| 1356 | } |
|---|