Java Source Code: org.outerj.daisy.jspwiki_import.JspWikiImporter


   1: /*
   2:  * Copyright 2004 Outerthought bvba and Schaubroeck nv
   3:  *
   4:  * Licensed under the Apache License, Version 2.0 (the "License");
   5:  * you may not use this file except in compliance with the License.
   6:  * You may obtain a copy of the License at
   7:  *
   8:  *     http://www.apache.org/licenses/LICENSE-2.0
   9:  *
  10:  * Unless required by applicable law or agreed to in writing, software
  11:  * distributed under the License is distributed on an "AS IS" BASIS,
  12:  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13:  * See the License for the specific language governing permissions and
  14:  * limitations under the License.
  15:  */
  16: package org.outerj.daisy.jspwiki_import;
  17: 
  18: import org.apache.commons.httpclient.HttpClient;
  19: import org.apache.commons.httpclient.HttpMethod;
  20: import org.apache.commons.httpclient.HttpStatus;
  21: import org.apache.commons.httpclient.methods.GetMethod;
  22: import org.apache.xerces.parsers.DOMParser;
  23: import org.cyberneko.html.HTMLConfiguration;
  24: import org.xml.sax.*;
  25: import org.xml.sax.helpers.AttributesImpl;
  26: import org.jaxen.dom.DOMXPath;
  27: import org.w3c.dom.Element;
  28: import org.w3c.dom.NodeList;
  29: import org.w3c.dom.Node;
  30: import org.outerj.daisy.htmlcleaner.HtmlCleanerFactory;
  31: import org.outerj.daisy.htmlcleaner.HtmlCleanerTemplate;
  32: import org.outerj.daisy.htmlcleaner.HtmlCleaner;
  33: import org.outerj.daisy.repository.*;
  34: import org.outerj.daisy.repository.clientimpl.RemoteRepositoryManager;
  35: 
  36: import javax.xml.transform.dom.DOMSource;
  37: import javax.xml.transform.Transformer;
  38: import javax.xml.transform.stream.StreamResult;
  39: import javax.xml.transform.sax.SAXTransformerFactory;
  40: import javax.xml.transform.sax.TransformerHandler;
  41: import javax.xml.transform.sax.SAXResult;
  42: import javax.xml.parsers.DocumentBuilderFactory;
  43: import javax.xml.parsers.DocumentBuilder;
  44: import javax.xml.parsers.SAXParserFactory;
  45: import javax.xml.parsers.SAXParser;
  46: import java.util.*;
  47: import java.io.*;
  48: import java.net.URLDecoder;
  49: 
  50: /**
  51:  * Standalone app to import contents of a JSP Wiki into daisy. Currently
  52:  * only written with the purpose of importing the Cocoon Wiki content to
  53:  * have some meaningful, and meaningful-sized testdata.
  54:  *
  55:  * <p>The import runs in two passes: first all wiki pages are imported
  56:  * into daisy, then links are translated from wiki page names to daisy
  57:  * document ids.
  58:  *
  59:  * <p>To run, after maven build, execute target/runimport.sh.
  60:  *
  61:  * <p>To make this usable as a generic utility, at least the hardcoded
  62:  * wiki location and daisy username, collection and url should be specifiable
  63:  * using command line parameters.
  64:  *
  65:  */
  66:	  public class JspWikiImporter {
  67:    private String wikiPageURL = "http://wiki.cocoondev.org/Wiki.jsp?page=";
  68:    private String collectionName = "cocoon";
  69:    private String daisyUser = "jspwiki-import";
  70:    private String daisyPassword = "topsecret";
  71:    private HashSet allPageNames = new HashSet();
  72:    private DocumentBuilder documentBuilder;
  73:    private HtmlCleanerTemplate htmlCleanerTemplate;
  74:    private SAXTransformerFactory transformerFactory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
  75:    private Repository repository;
  76:    private HashMap importPages = new HashMap();
  77:    private HashMap importedImages = new HashMap();
  78:    private HashMap importedAttachments = new HashMap();
  79:    private DocumentCollection collection;
  80:    private static HashSet skipPages = new HashSet();
  81:	      static {
  82:        skipPages.add("UndefinedPages");
  83:        skipPages.add("UnusedPages");
  84:        skipPages.add("IndexPage");
  85:        skipPages.add("RecentChanges");
  86:        skipPages.add("FullRecentChanges");
  87:    }
  88:
  89:	      public static void main(String[] args) throws Exception {
  90:        new JspWikiImporter().run();
  91:    }
  92:
  93:	      public void run() throws Exception {
  94:        // initialize some stuff
  95:        System.out.println("Doing preparations...");
  96:        documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
  97:        File htmlCleanerConfig = new File("../daisywiki/frontend/src/cocoon/webapp/daisy/resources/conf/htmlcleaner.xml");
  98:        htmlCleanerTemplate = new HtmlCleanerFactory().buildTemplate(new InputSource(new FileInputStream(htmlCleanerConfig)));
  99:
 100:        // connect to daisy
 101:        System.out.println("Connecting to daisy...");
 102:        Credentials credentials = new Credentials(daisyUser, daisyPassword);
 103:        RepositoryManager repositoryManager = new RemoteRepositoryManager("http://localhost:9263", credentials);
 104:        repository = repositoryManager.getRepository(credentials);
 105:        collection = repository.getCollectionManager().getCollectionByName(collectionName, false);
 106:
 107:        // load wiki page names
 108:        System.out.println("Fetching list of all pages on the wiki...");
 109:        loadPageNames();
 110:        System.out.println(allPageNames.size() + " pages found on the wiki.");
 111:        System.out.println();
 112:
 113:        String[] pages = (String[])allPageNames.toArray(new String[allPageNames.size()]);
 114:	          for (int i = 0; i < pages.length; i++) {
 115:	              if (pages[i].startsWith("Wyona")) {
 116:                System.out.println("Skipping page " + pages[i]);
 117:            } else if (skipPages.contains(pages[i])) {
 118:                System.out.println("Skipping page " + pages[i]);
 119:            } else {
 120:                System.out.println("Fetching page " + pages[i] + "... (" + i + " of " + pages.length + ")");
 121:                byte[] pageData = fetchPage(pages[i]);
 122:
 123:                System.out.println("Parsing and cleaning HTML...");
 124:                Document pageDocument = parseHtml(pageData);
 125:                DOMXPath xpath = new DOMXPath("//div[@class='content']");
 126:                Element contentDiv = (Element)xpath.selectSingleNode(pageDocument);
 127:                if (contentDiv == null)
 128:                    throw new Exception("No content found in page " + pages[i]);
 129:                String contentData = serialize(contentDivToDoc(contentDiv));
 130:                byte[] cleanedContent = clean(contentData);
 131:
 132:                System.out.println("Storing page in Daisy...");
 133:                Document document = repository.createDocument(pages[i], "SimpleDocument");
 134:                document.setPart("SimpleDocumentContent", "text/xml", cleanedContent);
 135:                document.addToCollection(collection);
 136:                document.save();
 137:                importPages.put(pages[i], new Long(document.getId()));
 138:                System.out.println("Done\n");
 139:            }
 140:        }
 141:
 142:        System.out.println("\n\nWILL NOW START LINK TRANSLATION\n\n");
 143:
 144:        Iterator importPagesIt = importPages.entrySet().iterator();
 145:	          while (importPagesIt.hasNext()) {
 146:            Map.Entry entry = (Map.Entry)importPagesIt.next();
 147:            String pageName = (String)entry.getKey();
 148:            long pageId = ((Long)entry.getValue()).longValue();
 149:
 150:            System.out.println("Translating links for document " + pageName + "...");
 151:            Document document = repository.getDocument(pageId, true);
 152:            byte[] pageData = document.getPart("SimpleDocumentContent").getData();
 153:            byte[] newData = clean(translateLinks(pageData));
 154:            document.setPart("SimpleDocumentContent", "text/xml", newData);
 155:            document.save();
 156:            System.out.println("Done\n");
 157:        }
 158:
 159:    }
 160:
 161:	      private byte[] clean(String htmlData) throws Exception {
 162:        HtmlCleaner cleaner = htmlCleanerTemplate.newHtmlCleaner();
 163:        return cleaner.cleanToByteArray(htmlData);
 164:    }
 165:
 166:	      private Document contentDivToDoc(Element contentDiv) {
 167:        Document doc = documentBuilder.newDocument();
 168:        Element htmlEl = doc.createElementNS(null, "html");
 169:        doc.appendChild(htmlEl);
 170:        Element bodyEl = doc.createElementNS(null, "body");
 171:        htmlEl.appendChild(bodyEl);
 172:        NodeList childNodes = contentDiv.getChildNodes();
 173:	          for (int i = 0; i < childNodes.getLength(); i++) {
 174:            Node node = childNodes.item(i);
 175:            boolean append = true;
 176:	              if (node instanceof Element && node.getLocalName().equals("h1")) {
 177:                Element divEl = (Element)node;
 178:	                  if (divEl.getAttribute("class").equals("pagename")) {
 179:                    append = false;
 180:                }
 181:            } else if (node instanceof Element && node.getLocalName().equals("div")) {
 182:                Element divEl = (Element)node;
 183:                // detect end of content by presence of a div with class bottom.
 184:	                  if (divEl.getAttribute("class").equals("bottom")) {
 185:                    return doc;
 186:                }
 187:            }
 188:            if (append)
 189:                bodyEl.appendChild(doc.importNode(node, true));
 190:        }
 191:        return doc;
 192:    }
 193:
 194:	      private String serialize(Document doc) throws Exception {
 195:        TransformerHandler serializer = transformerFactory.newTransformerHandler();
 196:        StringWriter writer = new StringWriter();
 197:        serializer.setResult(new StreamResult(writer));
 198:
 199:        Transformer streamer = transformerFactory.newTransformer();
 200:        streamer.transform(new DOMSource(doc), new SAXResult(new ExtraCleanup(serializer)));
 201:        return writer.toString();
 202:    }
 203:
 204:	      private void loadPageNames() throws Exception {
 205:        byte[] indexPageData = fetchPage("IndexPage");
 206:        Document document = parseHtml(indexPageData);
 207:        DOMXPath xpath = new DOMXPath("//a[@class='wikipage']");
 208:        List nodes = xpath.selectNodes(document);
 209:        Iterator nodesIt = nodes.iterator();
 210:	          while (nodesIt.hasNext()) {
 211:            Element element = (Element)nodesIt.next();
 212:            String href = element.getAttribute("href");
 213:            if (href.startsWith(wikiPageURL))
 214:                allPageNames.add(href.substring(wikiPageURL.length()));
 215:        }
 216:    }
 217:
 218:	      private byte[] fetchPage(String pageName) throws Exception {
 219:        HttpClient client = new HttpClient();
 220:        HttpMethod method = new GetMethod(wikiPageURL + pageName);
 221:        int status = client.executeMethod(method);
 222:        if (status != HttpStatus.SC_OK)
 223:            throw new Exception("Problem retrieving wiki page " + pageName + " : " + method.getStatusCode() + " : " + HttpStatus.getStatusText(method.getStatusCode()));
 224:        return method.getResponseBody();
 225:    }
 226:
 227:	      private Document parseHtml(byte[] data) throws Exception {
 228:        DOMParser parser = new DOMParser(new HTMLConfiguration());
 229:        parser.setFeature("http://xml.org/sax/features/namespaces", true);
 230:        parser.setFeature("http://cyberneko.org/html/features/override-namespaces", false);
 231:        parser.setFeature("http://cyberneko.org/html/features/insert-namespaces", false);
 232:        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
 233:        parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
 234:
 235:        parser.parse(new InputSource(new ByteArrayInputStream(data)));
 236:        return parser.getDocument();
 237:    }
 238:
 239:	      private String translateLinks(byte[] data) throws Exception {
 240:        TransformerHandler serializer = transformerFactory.newTransformerHandler();
 241:        StringWriter writer = new StringWriter();
 242:        serializer.setResult(new StreamResult(writer));
 243:
 244:        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
 245:        parserFactory.setNamespaceAware(true);
 246:        SAXParser parser = parserFactory.newSAXParser();
 247:        parser.getXMLReader().setContentHandler(new LinkTranslator(serializer));
 248:        parser.getXMLReader().parse(new InputSource(new ByteArrayInputStream(data)));
 249:
 250:        return writer.toString();
 251:    }
 252:
 253:	      class AbstractTransformer implements ContentHandler {
 254:        protected ContentHandler consumer;
 255:
 256:	          public AbstractTransformer(ContentHandler consumer) {
 257:            this.consumer = consumer;
 258:        }
 259:
 260:        public void endDocument()
 261:	          throws SAXException {
 262:            consumer.endDocument();
 263:        }
 264:
 265:        public void startDocument ()
 266:	          throws SAXException {
 267:            consumer.startDocument();
 268:        }
 269:
 270:        public void characters (char ch[], int start, int length)
 271:	          throws SAXException {
 272:            consumer.characters(ch, start, length);
 273:        }
 274:
 275:        public void ignorableWhitespace (char ch[], int start, int length)
 276:	          throws SAXException {
 277:            consumer.ignorableWhitespace(ch, start, length);
 278:        }
 279:
 280:        public void endPrefixMapping (String prefix)
 281:	          throws SAXException {
 282:            consumer.endPrefixMapping(prefix);
 283:        }
 284:
 285:        public void skippedEntity (String name)
 286:	          throws SAXException {
 287:            consumer.skippedEntity(name);
 288:        }
 289:
 290:	          public void setDocumentLocator (Locator locator) {
 291:            consumer.setDocumentLocator(locator);
 292:        }
 293:
 294:        public void processingInstruction (String target, String data)
 295:	          throws SAXException {
 296:            consumer.processingInstruction(target, data);
 297:        }
 298:
 299:        public void startPrefixMapping (String prefix, String uri)
 300:	          throws SAXException {
 301:            consumer.startPrefixMapping(prefix, uri);
 302:        }
 303:
 304:        public void endElement (String namespaceURI, String localName,
 305:                    String qName)
 306:	          throws SAXException {
 307:            consumer.endElement(namespaceURI, localName, qName);
 308:        }
 309:
 310:        public void startElement (String namespaceURI, String localName,
 311:                      String qName, Attributes atts)
 312:	          throws SAXException {
 313:            consumer.startElement(namespaceURI, localName, qName, atts);
 314:        }
 315:    }
 316:
 317:	      class LinkTranslator extends AbstractTransformer {
 318:
 319:	          public LinkTranslator(ContentHandler consumer) {
 320:            super(consumer);
 321:        }
 322:
 323:	          public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
 324:	              if (uri.equals("") && localName.equals("a")) {
 325:                int index = attributes.getIndex("href");
 326:                String href = (index != -1 ? attributes.getValue(index) : null);
 327:	                  if (href != null && href.startsWith(wikiPageURL)) {
 328:                    String linkedPage = href.substring(wikiPageURL.length());
 329:                    Long linkedPageId = (Long)importPages.get(linkedPage);
 330:                    System.out.println("attempt translation of " + linkedPage + " to " + linkedPageId);
 331:	                      if (linkedPageId != null) {
 332:                        AttributesImpl newAttrs = new AttributesImpl(attributes);
 333:                        newAttrs.setAttribute(newAttrs.getIndex("href"), "", "href", "href", "CDATA", "daisy:" + linkedPageId.longValue());
 334:                        attributes = newAttrs;
 335:                    }
 336:                }
 337:            }
 338:            consumer.startElement(uri, localName, qName, attributes);
 339:        }
 340:    }
 341:
 342:	      class ExtraCleanup extends AbstractTransformer {
 343:        private boolean dropNextImgEndTag = false;
 344:
 345:	          public ExtraCleanup(ContentHandler consumer) {
 346:            super(consumer);
 347:        }
 348:
 349:	          public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
 350:	              if (namespaceURI.equals("") && localName.equals("img") && ("http://wiki.cocoondev.org/images/out.png".equals(atts.getValue("src")) || "images/attachment_small.png".equals(atts.getValue("src")))) {
 351:                dropNextImgEndTag = true;
 352:                // skip element
 353:            } else if (namespaceURI.equals("") && localName.equals("img")) {
 354:                String src = atts.getValue("src");
 355:	                  if (src != null) {
 356:	                      if (importedImages.containsKey(src)) {
 357:                        AttributesImpl newAttrs = new AttributesImpl();
 358:                        newAttrs.addAttribute("", "src", "src", "CDATA", "daisy:" + importedImages.get(src));
 359:                    } else {
 360:	                          try {
 361:                            HttpClient client = new HttpClient();
 362:                            HttpMethod method = new GetMethod(src);
 363:                            int status = client.executeMethod(method);
 364:	                              if (status >= 300 && status < 400) {
 365:                                method = new GetMethod(method.getResponseHeader("location").getValue());
 366:                                status = client.executeMethod(method);
 367:                            }
 368:                            if (status != HttpStatus.SC_OK)
 369:                                throw new Exception("Problem retrieving image " + src + " : " + method.getStatusCode() + " : " + HttpStatus.getStatusText(method.getStatusCode()));
 370:                            byte[] data = method.getResponseBody();
 371:                            String name = getImageName(src);
 372:                            Document imageDocument = repository.createDocument(name, "Image");
 373:                            imageDocument.setPart("ImageData", method.getResponseHeader("Content-Type").getValue(), data);
 374:                            imageDocument.addToCollection(collection);
 375:                            imageDocument.save();
 376:                            importedImages.put(src, String.valueOf(imageDocument.getId()));
 377:                            AttributesImpl newAttrs = new AttributesImpl();
 378:                            newAttrs.addAttribute("", "src", "src", "CDATA", "daisy:" + imageDocument.getId());
 379:                            super.startElement("", "img", "img", newAttrs);
 380:                            System.out.println("Imported image " + src + " as " + name);
 381:                        } catch (Exception e) {
 382:                            throw new SAXException("Error getting image " + src, e);
 383:                        }
 384:                    }
 385:                }
 386:            } else if (namespaceURI.equals("") && localName.equals("a") && "attachment".equals(atts.getValue("class"))) {
 387:                String src = atts.getValue("href");
 388:                String decodedSrc = null;
 389:	                  try {
 390:                    decodedSrc = URLDecoder.decode(src, "UTF-8");
 391:                } catch (UnsupportedEncodingException e) {
 392:                    throw new SAXException(e);
 393:                }
 394:	                  if (importedAttachments.containsKey(src)) {
 395:                    AttributesImpl newAttrs = new AttributesImpl();
 396:                    newAttrs.addAttribute("", "src", "src", "CDATA", "daisy:" + importedAttachments.get(src));
 397:                } else {
 398:	                      try {
 399:                        HttpClient client = new HttpClient();
 400:                        HttpMethod method = new GetMethod(src);
 401:                        int status = client.executeMethod(method);
 402:                        if (status != HttpStatus.SC_OK)
 403:                            throw new Exception("Problem retrieving attachment " + src + " : " + method.getStatusCode() + " : " + HttpStatus.getStatusText(method.getStatusCode()));
 404:                        byte[] data = method.getResponseBody();
 405:                        String name = getImageName(decodedSrc);
 406:                        Document attachmentDocument = repository.createDocument(name, "Attachment");
 407:                        attachmentDocument.setPart("AttachmentData", method.getResponseHeader("Content-Type").getValue(), data);
 408:                        attachmentDocument.addToCollection(collection);
 409:                        attachmentDocument.save();
 410:                        importedAttachments.put(src, String.valueOf(attachmentDocument.getId()));
 411:                        AttributesImpl newAttrs = new AttributesImpl();
 412:                        newAttrs.addAttribute("", "href", "href", "CDATA", "daisy:" + attachmentDocument.getId());
 413:                        super.startElement("", "a", "a", newAttrs);
 414:                        System.out.println("Imported attachment " + src + " as " + name);
 415:                    } catch (Exception e) {
 416:                        throw new SAXException("Error getting attachment " + src, e);
 417:                    }
 418:                }
 419:            } else {
 420:                super.startElement(namespaceURI, localName, qName, atts);
 421:            }
 422:        }
 423:
 424:	          private String getImageName(String src) {
 425:            String name = src.substring(src.lastIndexOf('/') + 1);
 426:            int dotpos = name.lastIndexOf('.');
 427:	              if (dotpos != -1) {
 428:                name = name.substring(0, dotpos);
 429:            }
 430:            return name;
 431:        }
 432:
 433:	          public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
 434:	              if (dropNextImgEndTag && namespaceURI.equals("") && localName.equals("img")) {
 435:                // skip
 436:                dropNextImgEndTag = false;
 437:                // note that this code assumes img elements are never nested.
 438:            } else {
 439:                super.endElement(namespaceURI, localName, qName);
 440:            }
 441:        }
 442:    }
 443:}