Java Source Code: org.archive.crawler.frontier.AdaptiveRevisitFrontier


   1: /* AdaptiveRevisitFrontier.java
   2: *
   3: * Created on Sep 13, 2004
   4: *
   5: * Copyright (C) 2004 Kristinn Sigurðsson.
   6: *
   7: * This file is part of the Heritrix web crawler (crawler.archive.org).
   8: *
   9: * Heritrix is free software; you can redistribute it and/or modify
  10: * it under the terms of the GNU Lesser Public License as published by
  11: * the Free Software Foundation; either version 2.1 of the License, or
  12: * any later version.
  13: *
  14: * Heritrix is distributed in the hope that it will be useful,
  15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17: * GNU Lesser Public License for more details.
  18: *
  19: * You should have received a copy of the GNU Lesser Public License
  20: * along with Heritrix; if not, write to the Free Software
  21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  22: */
  23: package org.archive.crawler.frontier;
  24: 
  25: import java.io.File;
  26: import java.io.IOException;
  27: import java.io.PrintWriter;
  28: import java.io.Serializable;
  29: import java.io.StringWriter;
  30: import java.io.Writer;
  31: import java.util.ArrayList;
  32: import java.util.Date;
  33: import java.util.Iterator;
  34: import java.util.List;
  35: import java.util.logging.Level;
  36: import java.util.logging.Logger;
  37: 
  38: import javax.management.AttributeNotFoundException;
  39: 
  40: import org.apache.commons.httpclient.HttpStatus;
  41: import org.archive.crawler.datamodel.CandidateURI;
  42: import org.archive.crawler.datamodel.CoreAttributeConstants;
  43: import org.archive.crawler.datamodel.CrawlServer;
  44: import org.archive.crawler.datamodel.CrawlURI;
  45: import org.archive.crawler.datamodel.FetchStatusCodes;
  46: import org.archive.crawler.datamodel.UriUniqFilter;
  47: import org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver;
  48: import org.archive.crawler.event.CrawlStatusListener;
  49: import org.archive.crawler.framework.CrawlController;
  50: import org.archive.crawler.framework.Frontier;
  51: import org.archive.crawler.framework.FrontierMarker;
  52: import org.archive.crawler.framework.exceptions.EndedException;
  53: import org.archive.crawler.framework.exceptions.FatalConfigurationException;
  54: import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
  55: import org.archive.crawler.settings.ModuleType;
  56: import org.archive.crawler.settings.RegularExpressionConstraint;
  57: import org.archive.crawler.settings.SimpleType;
  58: import org.archive.crawler.settings.Type;
  59: import org.archive.crawler.url.Canonicalizer;
  60: import org.archive.crawler.util.BdbUriUniqFilter;
  61: import org.archive.net.UURI;
  62: import org.archive.queue.MemQueue;
  63: import org.archive.queue.Queue;
  64: import org.archive.util.ArchiveUtils;
  65: 
  66: 
  67: /**
  68:  * A Frontier that will repeatedly visit all encountered URIs. 
  69:  * <p>
  70:  * Wait time between visits is configurable and varies based on observed 
  71:  * changes of documents.
  72:  * <p>
  73:  * The Frontier borrows many things from HostQueuesFrontier, but implements 
  74:  * an entirely different strategy in issuing URIs and consequently in keeping a
  75:  * record of discovered URIs.
  76:  *
  77:  * @author Kristinn Sigurdsson
  78:  */
  79: public class AdaptiveRevisitFrontier extends ModuleType 
  80: implements Frontier, FetchStatusCodes, CoreAttributeConstants,
  81:	          AdaptiveRevisitAttributeConstants, CrawlStatusListener, HasUriReceiver {
  82:
  83:    private static final long serialVersionUID = -8666872690438543671L;
  84:
  85:    private static final Logger logger =
  86:        Logger.getLogger(AdaptiveRevisitFrontier.class.getName());
  87:
  88:    /** How many multiples of last fetch elapsed time to wait before recontacting
  89:     * same server */
  90:    public final static String ATTR_DELAY_FACTOR = "delay-factor";
  91:    private final static Float DEFAULT_DELAY_FACTOR = new Float(5);
  92:    
  93:    /** Always wait this long after one completion before recontacting
  94:     * same server, regardless of multiple */
  95:    public final static String ATTR_MIN_DELAY = "min-delay-ms";
  96:
  97:    // 2 seconds
  98:    private final static Integer DEFAULT_MIN_DELAY = new Integer(2000);
  99:    
 100:    /** Never wait more than this long, regardless of multiple */
 101:    public final static String ATTR_MAX_DELAY = "max-delay-ms";
 102:    
 103:    // 30 seconds
 104:    private final static Integer DEFAULT_MAX_DELAY = new Integer(30000);
 105:    
 106:    /** Maximum times to emit a CrawlURI without final disposition */
 107:    public final static String ATTR_MAX_RETRIES = "max-retries";
 108:    private final static Integer DEFAULT_MAX_RETRIES = new Integer(30);
 109:
 110:    /** For retryable problems, seconds to wait before a retry */
 111:    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";
 112:    
 113:    // 15 minutes
 114:    private final static Long DEFAULT_RETRY_DELAY = new Long(900);
 115:    
 116:    /** Maximum simultaneous requests in process to a host (queue) */
 117:    public final static String ATTR_HOST_VALENCE = "host-valence";
 118:    private final static Integer DEFAULT_HOST_VALENCE = new Integer(1); 
 119:
 120:    /** Number of hops of embeds (ERX) to bump to front of host queue */
 121:    public final static String ATTR_PREFERENCE_EMBED_HOPS =
 122:        "preference-embed-hops";
 123:    private final static Integer DEFAULT_PREFERENCE_EMBED_HOPS = new Integer(0); 
 124:    
 125:    /** Queue assignment to force on CrawlURIs. Intended to be used 
  126:     *  via overrides. */
 127:    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";
 128:    protected final static String DEFAULT_FORCE_QUEUE = "";
 129:    /** Acceptable characters in forced queue names.
 130:     *  Word chars, dash, period, comma, colon */
 131:    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";
 132:
  133:    /** Should the queue assignment ignore 'www' in hostnames, effectively 
  134:     *  stripping it away. 
 135:     */
 136:    public final static String ATTR_QUEUE_IGNORE_WWW = "queue-ignore-www";
 137:    protected final static Boolean DEFAULT_QUEUE_IGNORE_WWW = new Boolean(false);
 138:    
  139:    /** Should the Frontier use a separate 'already included' datastructure
 140:     *  or rely on the queues'. 
 141:     */
 142:    public final static String ATTR_USE_URI_UNIQ_FILTER = "use-uri-uniq-filter";
 143:    protected final static Boolean DEFAULT_USE_URI_UNIQ_FILTER = new Boolean(false);
 144:    
 145:    private CrawlController controller;
 146:    
 147:    private AdaptiveRevisitQueueList hostQueues;
 148:    
 149:    private UriUniqFilter alreadyIncluded;
 150:
 151:    private ThreadLocalQueue threadWaiting = new ThreadLocalQueue();
 152:
 153:    /** Policy for assigning CrawlURIs to named queues */
 154:    private QueueAssignmentPolicy queueAssignmentPolicy = null;
 155:    
 156:    // top-level stats
 157:    private long succeededFetchCount = 0;
 158:    private long failedFetchCount = 0;
  159:    // URIs that are disregarded (for example because of robots.txt rules)
 160:    private long disregardedUriCount = 0;
 161:
 162:    private long totalProcessedBytes = 0;
 163:    
 164:    // Flags indicating operator-specified crawl pause/end 
 165:    private boolean shouldPause = false;
 166:    private boolean shouldTerminate = false;
 167:    
 168:
  169:    public AdaptiveRevisitFrontier(String name) {
  170:        this(name, "AdaptiveRevisitFrontier. EXPERIMENTAL Frontier that " +
  171:                "will repeatedly visit all " +
  172:                "encountered URIs. Wait time between visits is configurable" +
  173:                " and is determined by separate Processor(s). See " +
  174:                "WaitEvaluators. " +
  175:                "See documentation for ARFrontier limitations.");
 176:    }
 177:
 178:	      public AdaptiveRevisitFrontier(String name, String description) {
 179:        super(Frontier.ATTR_NAME, description);
 180:        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
 181:                "How many multiples of last fetch elapsed time to wait before " +
 182:                "recontacting same server", DEFAULT_DELAY_FACTOR));
 183:            addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
 184:                "Never wait more than this long, regardless of multiple",
 185:                DEFAULT_MAX_DELAY));
 186:            addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
 187:                "Always wait this long after one completion before recontacting " +
 188:                "same server, regardless of multiple", DEFAULT_MIN_DELAY));
 189:             addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
 190:                "How often to retry fetching a URI that failed to be retrieved.\n" +
 191:                "If zero, the crawler will get the robots.txt only.",
 192:                DEFAULT_MAX_RETRIES));
 193:            addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
 194:                    "How long to wait by default until we retry fetching a" +
 195:                    " URI that failed to be retrieved (seconds). ",
 196:                    DEFAULT_RETRY_DELAY));
 197:            addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
 198:                    "Number of embedded (or redirected) hops up to which " +
 199:                    "a URI has higher priority scheduling. For example, if set " +
  200:                "to 1, items such as inline images (1-hop " +
 201:                    "embedded resources) will be scheduled ahead of all regular " +
 202:                    "links (or many-hop resources, like nested frames). If set to " +
 203:                    "zero, no preferencing will occur, and embeds/redirects are " +
 204:                    "scheduled the same as regular links.",
 205:                    DEFAULT_PREFERENCE_EMBED_HOPS));
 206:            Type t;
 207:            t = addElementToDefinition(new SimpleType(ATTR_HOST_VALENCE,
 208:                    "Maximum number of simultaneous requests to a single" +
 209:                    " host.",
 210:                    DEFAULT_HOST_VALENCE));
 211:            t.setExpertSetting(true);
 212:            t = addElementToDefinition(new SimpleType(ATTR_QUEUE_IGNORE_WWW,
 213:                    "If true then documents from x.com, www.x.com and any " +
 214:                    "www[0-9]+.x.com will be assigned to the same queue.",
 215:                    DEFAULT_QUEUE_IGNORE_WWW));
 216:            t.setExpertSetting(true);
 217:            t = addElementToDefinition(new SimpleType(
 218:                    ATTR_FORCE_QUEUE,
 219:                    "The queue name into which to force URIs. Should "
 220:                    + "be left blank at global level.  Specify a "
 221:                    + "per-domain/per-host override to force URIs into "
 222:                    + "a particular named queue, regardless of the assignment "
 223:                    + "policy in effect (domain or ip-based politeness). "
 224:                    + "This could be used on domains known to all be from "
 225:                    + "the same small set of IPs (eg blogspot, dailykos, etc.) "
 226:                    + "to simulate IP-based politeness, or it could be used if "
 227:                    + "you wanted to enforce politeness over a whole domain, even "
 228:                    + "though the subdomains are split across many IPs.",
 229:                    DEFAULT_FORCE_QUEUE));
 230:            t.setOverrideable(true);
 231:            t.setExpertSetting(true);
 232:            t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE,
 233:                    Level.WARNING, "This field must contain only alphanumeric "
 234:                    + "characters plus period, dash, comma, colon, or underscore."));
 235:            t = addElementToDefinition(new SimpleType(ATTR_USE_URI_UNIQ_FILTER,
  236:                    "If true then the Frontier will use a separate " +
 237:                    "datastructure to detect and eliminate duplicates.\n" +
 238:                    "This is required for Canonicalization rules to work.",
 239:                    DEFAULT_USE_URI_UNIQ_FILTER));
 240:            t.setExpertSetting(true);
 241:            t.setOverrideable(false);
 242:
 243:        // Register persistent CrawlURI items 
 244:        CrawlURI.addAlistPersistentMember(A_CONTENT_STATE_KEY);
 245:        CrawlURI.addAlistPersistentMember(A_TIME_OF_NEXT_PROCESSING);
 246:    }
 247:
 248:    public synchronized void initialize(CrawlController c)
 249:	              throws FatalConfigurationException, IOException {
 250:        controller = c;
 251:        controller.addCrawlStatusListener(this);
 252:
 253:        queueAssignmentPolicy = new HostnameQueueAssignmentPolicy();
 254:        
 255:        hostQueues = new AdaptiveRevisitQueueList(c.getBdbEnvironment(),
 256:            c.getClassCatalog());
 257:        
 258:        if(((Boolean)getUncheckedAttribute(
 259:	                  null,ATTR_USE_URI_UNIQ_FILTER)).booleanValue()){
 260:            alreadyIncluded = createAlreadyIncluded();
 261:        } else {
 262:            alreadyIncluded = null;
 263:        }
 264:        
 265:        loadSeeds();
 266:    }
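
        // Initialization sketch (editorial summary of the method above): the
        // frontier registers itself as a CrawlStatusListener, builds the
        // BDB-backed host queue list, optionally creates a BdbUriUniqFilter
        // (required for canonicalization rules to take effect), and finally
        // loads the seeds.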
 267:
 268:    /**
 269:     * Create a UriUniqFilter that will serve as record 
 270:     * of already seen URIs.
 271:     *
  272:     * @return A UriUniqFilter that will serve as a record of already seen URIs
 273:     * @throws IOException
 274:     */
 275:	      protected UriUniqFilter createAlreadyIncluded() throws IOException {
 276:        UriUniqFilter uuf = new BdbUriUniqFilter(
 277:                this.controller.getBdbEnvironment());
 278:        uuf.setDestination(this);
 279:        return uuf;
 280:    }
 281:    
 282:    /**
 283:     * Loads the seeds
 284:     * <p>
 285:     * This method is called by initialize() and kickUpdate()
 286:     */
 287:	      public void loadSeeds() {
 288:        Writer ignoredWriter = new StringWriter();
 289:        // Get the seeds to refresh.
 290:        Iterator iter = this.controller.getScope().seedsIterator(ignoredWriter);
 291:	          while (iter.hasNext()) {
 292:            CandidateURI caUri =
 293:                CandidateURI.createSeedCandidateURI((UURI)iter.next());
 294:            caUri.setSchedulingDirective(CandidateURI.MEDIUM);
 295:            schedule(caUri);
 296:        }
 297:        batchFlush();
 298:        // save ignored items (if any) where they can be consulted later
 299:        AbstractFrontier.saveIgnoredItems(
 300:                ignoredWriter.toString(), 
 301:                controller.getDisk());
 302:    }
 303:    
 304:	      public String getClassKey(CandidateURI cauri) {
 305:        String queueKey = (String)getUncheckedAttribute(cauri,
 306:                ATTR_FORCE_QUEUE);
 307:	              if ("".equals(queueKey)) {
 308:                // Typical case, barring overrides
 309:                queueKey =
 310:                    queueAssignmentPolicy.getClassKey(controller, cauri);
  311:                // The queueAssignmentPolicy is always based on hostnames.
  312:                // We may need to remove any www[0-9]{0,}\. prefixes from the
  313:                // hostnames.
 314:                if(((Boolean)getUncheckedAttribute(
 315:	                          cauri,ATTR_QUEUE_IGNORE_WWW)).booleanValue()){
 316:                    queueKey = queueKey.replaceAll("^www[0-9]{0,}\\.","");
 317:                }
 318:            }
 319:            return queueKey;
 320:    }
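
        // Illustrative example (editorial, hostnames hypothetical): with
        // queue-ignore-www enabled, the replaceAll above maps
        //   www.example.com  -> example.com
        //   www3.example.com -> example.com
        //   wwwhost.com      -> wwwhost.com  (no dot follows the www prefix)
        // so all www[0-9]* variants of a host share a single queue.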
 321:
 322:    /**
  323:     * Canonicalize passed uuri. It would be sweeter if this canonicalize
  324:     * function were encapsulated by that which it canonicalizes, but because
  325:     * settings change with context -- i.e. there may be overrides in operation
  326:     * for a particular URI -- it's not so easy; each CandidateURI would need a
  327:     * reference to the settings system, which is awkward to pass in.
 328:     * 
 329:     * @param uuri Candidate URI to canonicalize.
 330:     * @return Canonicalized version of passed <code>uuri</code>.
 331:     */
 332:	      protected String canonicalize(UURI uuri) {
 333:        return Canonicalizer.canonicalize(uuri, this.controller.getOrder());
 334:    }
 335:
 336:    /**
  337:     * Canonicalize passed CandidateURI. This method differs from
  338:     * {@link #canonicalize(UURI)} in that it takes a look at the
  339:     * CandidateURI's context, possibly overriding the canonicalization if it
  340:     * could make us miss content. If canonicalization produces a URL that
  341:     * was 'alreadyseen', but the entry in the 'alreadyseen' database did
  342:     * nothing but redirect to the current URL, we won't get the current URL;
  343:     * we'll think we've already seen it. Examples would be archive.org
  344:     * redirecting to www.archive.org or the inverse, www.netarkivet.net
  345:     * redirecting to netarkivet.net (assuming the stripWWW rule is enabled).
  346:     * <p>Note, this method under some circumstances sets the forceFetch flag.
  347:     * 
  348:     * @param cauri CandidateURI to examine.
  349:     * @return Canonicalized <code>cauri</code>.
 350:     */
 351:	      protected String canonicalize(CandidateURI cauri) {
 352:        String canon = canonicalize(cauri.getUURI());
 353:	          if (cauri.isLocation()) {
  354:            // If the via is not the same as where we're being redirected (i.e.
  355:            // we're not being redirected back to the same page), AND the
  356:            // canonicalization of the via equals that of the current cauri,
  357:            // THEN forcefetch (so there is no chance of not crawling content
  358:            // because the alreadyseen check thinks it has seen the url before).
 359:            // An example of an URL that redirects to itself is:
 360:            // http://bridalelegance.com/images/buttons3/tuxedos-off.gif.
 361:            // An example of an URL whose canonicalization equals its via's
 362:            // canonicalization, and we want to fetch content at the
 363:            // redirection (i.e. need to set forcefetch), is netarkivet.dk.
 364:            if (!cauri.toString().equals(cauri.getVia().toString()) &&
 365:	                      canonicalize(cauri.getVia()).equals(canon)) {
 366:                cauri.setForceFetch(true);
 367:            }
 368:        }
 369:        return canon;
 370:    }
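
        // Illustrative walk-through (editorial, drawn from the javadoc above):
        // suppose http://netarkivet.dk/ redirects to http://www.netarkivet.dk/
        // and a strip-www canonicalization rule is in effect. For the redirect
        // target, via and cauri differ, but canonicalize(via) equals
        // canonicalize(cauri), so forceFetch is set and the alreadyseen check
        // cannot swallow the redirect target.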
 371:
 372:    /**
  373:     * Schedules the passed CandidateURI, wrapping it in a CrawlURI if needed.
 374:     * @param caUri The URI to schedule.
 375:     */
 376:	      protected void innerSchedule(CandidateURI caUri) {
 377:        CrawlURI curi;
 378:	          if(caUri instanceof CrawlURI) {
 379:            curi = (CrawlURI) caUri;
 380:        } else {
 381:            curi = CrawlURI.from(caUri,System.currentTimeMillis());
 382:            curi.putLong(A_TIME_OF_NEXT_PROCESSING,
 383:                System.currentTimeMillis());
 384:            // New CrawlURIs get 'current time' as the time of next processing.
 385:        }
 386:        
 387:	          if(curi.getClassKey() == null){
 388:            curi.setClassKey(getClassKey(curi));
 389:        }
 390:
 391:        if(curi.isSeed() && curi.getVia() != null
 392:	                  && curi.flattenVia().length() > 0) {
 393:            // The only way a seed can have a non-empty via is if it is the
 394:            // result of a seed redirect.  Add it to the seeds list.
 395:            //
  396:            // This is a feature. This is handling for the case where a seed
  397:            // gets immediately redirected to another page. What we're doing
  398:            // is treating the immediate redirect target as a seed.
 399:            this.controller.getScope().addSeed(curi);
 400:            // And it needs rapid scheduling.
 401:            curi.setSchedulingDirective(CandidateURI.MEDIUM);
 402:        }
 403:        
 404:        // Optionally preferencing embeds up to MEDIUM
 405:        int prefHops = ((Integer) getUncheckedAttribute(curi,
 406:                ATTR_PREFERENCE_EMBED_HOPS)).intValue();
 407:        boolean prefEmbed = false;
 408:	          if (prefHops > 0) {
 409:            int embedHops = curi.getTransHops();
 410:            if (embedHops > 0 && embedHops <= prefHops
 411:	                      && curi.getSchedulingDirective() == CandidateURI.NORMAL) {
 412:                // number of embed hops falls within the preferenced range, and
 413:                // uri is not already MEDIUM -- so promote it
 414:                curi.setSchedulingDirective(CandidateURI.MEDIUM);
 415:                prefEmbed = true;
 416:            }
 417:        }
 418:
 419:        // Finally, allow curi to be fetched right now 
 420:        // (while not overriding overdue items)
 421:        curi.putLong(A_TIME_OF_NEXT_PROCESSING,
 422:                System.currentTimeMillis());
 423:        
 424:	          try {
 425:            logger.finest("scheduling " + curi.toString());
 426:            AdaptiveRevisitHostQueue hq = getHQ(curi);
 427:            hq.add(curi,prefEmbed);
 428:        } catch (IOException e) {
 429:            // TODO Handle IOExceptions
 430:            e.printStackTrace();
 431:        }
 432:        
 433:    }
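
        // Illustrative example (editorial, values hypothetical): with
        // preference-embed-hops set to 1, an inline image one transitive hop
        // from its page (getTransHops() == 1) that is still scheduled NORMAL
        // is promoted to MEDIUM and added with prefEmbed == true, placing it
        // ahead of regular links in its host queue.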
 434:
 435:    /**
 436:     * Get the AdaptiveRevisitHostQueue for the given CrawlURI, creating
 437:     * it if necessary. 
 438:     * 
 439:     * @param curi CrawlURI for which to get a queue
 440:     * @return AdaptiveRevisitHostQueue for given CrawlURI
 441:     * @throws IOException
 442:     */
 443:	      protected AdaptiveRevisitHostQueue getHQ(CrawlURI curi) throws IOException {
 444:        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
 445:	          if(hq == null){
 446:            // Need to create it.
 447:            int valence = DEFAULT_HOST_VALENCE.intValue();
 448:	              try {
 449:                valence = ((Integer)getAttribute(curi,ATTR_HOST_VALENCE)).intValue();
 450:            } catch (AttributeNotFoundException e2) {
 451:                logger.severe("Unable to load valence.");
 452:            }
 453:            hq = hostQueues.createHQ(curi.getClassKey(),valence);
 454:        }
 455:        return hq;
 456:    }
 457:
 458:	      protected void batchSchedule(CandidateURI caUri) {
 459:        threadWaiting.getQueue().enqueue(caUri);
 460:    }
 461:
 462:	      protected void batchFlush() {
 463:        innerBatchFlush();
 464:    }
 465:
 466:	      private void innerBatchFlush() {
 467:        Queue q = threadWaiting.getQueue();
 468:	          while(!q.isEmpty()) {
 469:            CandidateURI caUri = (CandidateURI)q.dequeue();
 470:	              if(alreadyIncluded != null){
  471:                String canon = canonicalize(caUri);
  472:                logger.finest("Canonicalization of " + caUri + " is " + canon);
  473:                if (caUri.forceFetch()) {
  474:                    alreadyIncluded.addForce(canon, caUri);
  475:                } else {
  476:                    alreadyIncluded.add(canon, caUri);
  477:                }
 478:            } else {
 479:                innerSchedule(caUri);
 480:            }
 481:        }
 482:    }
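
        // Flow sketch (editorial summary, not authoritative): schedule() only
        // enqueues onto the calling thread's thread-local queue; this method
        // drains that queue, and when use-uri-uniq-filter is true each URI
        // detours through alreadyIncluded, which calls receive() back for
        // novel URIs -- receive() then invokes innerSchedule(). With the
        // filter disabled, innerSchedule() is called directly.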
 483:    
 484:    /**
 485:     * @param curi
 486:     * @return the CrawlServer to be associated with this CrawlURI
 487:     */
 488:	      protected CrawlServer getServer(CrawlURI curi) {
 489:        return this.controller.getServerCache().getServerFor(curi);
 490:    }
 491:
 492:    /* (non-Javadoc)
 493:     * @see org.archive.crawler.framework.Frontier#next()
 494:     */
 495:    public synchronized CrawlURI next() 
 496:	              throws InterruptedException, EndedException {
 497:        controller.checkFinish();
 498:        
 499:	          while(shouldPause){
 500:            controller.toePaused();
 501:            wait();
 502:        }
 503:        
 504:	          if(shouldTerminate){
 505:            throw new EndedException("terminated");
 506:        }
 507:        
 508:        AdaptiveRevisitHostQueue hq = hostQueues.getTopHQ();
 509:        
 510:	          while(hq.getState() != AdaptiveRevisitHostQueue.HQSTATE_READY){
  511:            // Ok, so we don't have a ready queue; wait until the top one
  512:            // becomes available.
 513:            long waitTime = hq.getNextReadyTime() - System.currentTimeMillis();
 514:	              if(waitTime > 0){
 515:                wait(waitTime);
 516:            }
 517:            // The top HQ may have changed, so get it again
 518:            hq = hostQueues.getTopHQ(); 
 519:        }             
 520:
 521:	          if(shouldTerminate){
 522:            // May have been terminated while thread was waiting for IO
 523:            throw new EndedException("terminated");
 524:        }
 525:        
 526:	          try {
 527:            CrawlURI curi = hq.next();
 528:            // Populate CURI with 'transient' variables such as server.
 529:            logger.fine("Issuing " + curi.toString());
 530:            long temp = curi.getLong(A_TIME_OF_NEXT_PROCESSING);
 531:            long currT = System.currentTimeMillis();
 532:            long overdue = (currT-temp);
 533:	              if(logger.isLoggable(Level.FINER)){
 534:                String waitI = "not set";
 535:	                  if(curi.containsKey(A_WAIT_INTERVAL)){
 536:                    waitI = ArchiveUtils.formatMillisecondsToConventional(
 537:                            curi.getLong(A_WAIT_INTERVAL));
 538:                }
 539:                logger.finer("Wait interval: " + waitI + 
 540:                        ", Time of next proc: " + temp +
 541:                        ", Current time: " + currT +
 542:                        ", Overdue by: " + overdue + "ms");
 543:            }
 544:	              if(overdue < 0){
 545:                // This should never happen.
  546:                logger.severe("Time overdue for " + curi.toString() + 
  547:                        " is negative (" + overdue + ")!");
 548:            }
 549:            curi.putLong(A_FETCH_OVERDUE,overdue);
 550:            return curi;
 551:        } catch (IOException e) {
 552:            // TODO: Need to handle this in an intelligent manner. 
 553:            //       Is probably fatal?
 554:            e.printStackTrace();
 555:        }
 556:
 557:        return null;
 558:    }
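
        // Behavior sketch (editorial summary of next() above): a worker thread
        // blocks while the top-ranked host queue is unready, sleeping until
        // that queue's next-ready time; finished() calls notifyAll() on this
        // monitor, so waiters re-check which queue is on top before issuing.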
 559:
 560:    /* (non-Javadoc)
 561:     * @see org.archive.crawler.framework.Frontier#isEmpty()
 562:     */
 563:	      public boolean isEmpty() {
 564:        // Technically, the Frontier should never become empty since URIs are
 565:        // only discarded under exceptional circumstances.
 566:        return hostQueues.getSize() == 0;
 567:    }
 568:
 569:    /* (non-Javadoc)
 570:     * @see org.archive.crawler.framework.Frontier#schedule(org.archive.crawler.datamodel.CandidateURI)
 571:     */
 572:	      public void schedule(CandidateURI caURI) {
 573:        batchSchedule(caURI);        
 574:    }
 575:
 576:    /* (non-Javadoc)
 577:     * @see org.archive.crawler.framework.Frontier#finished(org.archive.crawler.datamodel.CrawlURI)
 578:     */
 579:	      public synchronized void finished(CrawlURI curi) {
 580:        logger.fine(curi.toString()+ " " + 
 581:                CrawlURI.fetchStatusCodesToString(curi.getFetchStatus()));
 582:        curi.incrementFetchAttempts();
 583:        logLocalizedErrors(curi);
 584:
 585:        innerFinished(curi);
 586:    }
 587:    
 588:	      protected synchronized void innerFinished(CrawlURI curi) {
 589:	          try {
 590:            innerBatchFlush();
 591:            
 592:	              if (curi.isSuccess()) {
 593:                successDisposition(curi);
 594:            } else if (needsPromptRetry(curi)) {
 595:                // Consider statuses which allow nearly-immediate retry
 596:                // (like deferred to allow precondition to be fetched)
 597:                reschedule(curi,false);
 598:            } else if (needsRetrying(curi)) {
 599:                // Consider errors which can be retried
 600:                reschedule(curi,true);
 601:                controller.fireCrawledURINeedRetryEvent(curi);
 602:            } else if(isDisregarded(curi)) {
  603:                // Check for codes that mean that while the crawler did
  604:                // manage to get it, it must be disregarded for some reason.
 605:                disregardDisposition(curi);
 606:            } else {
  607:                // Otherwise it is a FAILURE; note & log
 608:                failureDisposition(curi);
 609:            }
 610:
 611:            // New items might be available, let waiting threads know
  612:            // More than one queue might have become available due to 
 613:            // scheduling of items outside the parent URIs host, so we
 614:            // wake all waiting threads.
 615:            notifyAll();
 616:        } catch (RuntimeException e) {
 617:            curi.setFetchStatus(S_RUNTIME_EXCEPTION);
 618:            // store exception temporarily for logging
 619:            logger.warning("RTE in innerFinished() " +
 620:                e.getMessage());
 621:            e.printStackTrace();
 622:            curi.putObject(A_RUNTIME_EXCEPTION, e);
 623:            failureDisposition(curi);
 624:        } catch (AttributeNotFoundException e) {
 625:            logger.severe(e.getMessage());
 626:        }
 627:    }
 628:
 629:    /**
 630:     * Take note of any processor-local errors that have
 631:     * been entered into the CrawlURI.
 632:     * @param curi CrawlURI with errors.
 633:     */
 634:	      private void logLocalizedErrors(CrawlURI curi) {
 635:	          if(curi.containsKey(A_LOCALIZED_ERRORS)) {
 636:            List localErrors = (List)curi.getObject(A_LOCALIZED_ERRORS);
 637:            Iterator iter = localErrors.iterator();
 638:	              while(iter.hasNext()) {
 639:                Object array[] = {curi, iter.next()};
 640:                controller.localErrors.log(Level.WARNING,
 641:                    curi.getUURI().toString(), array);
 642:            }
 643:            // once logged, discard
 644:            curi.remove(A_LOCALIZED_ERRORS);
 645:        }
 646:    }
 647:    
 648:    /**
 649:     * The CrawlURI has been successfully crawled. 
 650:     *
 651:     * @param curi The CrawlURI
 652:     */
 653:	      protected void successDisposition(CrawlURI curi) {
 654:        curi.aboutToLog();
 655:
 656:        long waitInterval = 0;
 657:        
 658:	          if(curi.containsKey(A_WAIT_INTERVAL)){
 659:            waitInterval = curi.getLong(A_WAIT_INTERVAL);
 660:            curi.addAnnotation("wt:" + 
 661:                    ArchiveUtils.formatMillisecondsToConventional(
 662:                            waitInterval));
 663:        } else {
  664:            logger.severe("Missing wait interval for " + curi.toString() +
  665:                    ". WaitEvaluator may be missing.");
 666:        }
 667:	          if(curi.containsKey(A_NUMBER_OF_VISITS)){
 668:            curi.addAnnotation(curi.getInt(A_NUMBER_OF_VISITS) + "vis");
 669:        }
 670:	          if(curi.containsKey(A_NUMBER_OF_VERSIONS)){
 671:            curi.addAnnotation(curi.getInt(A_NUMBER_OF_VERSIONS) + "ver");
 672:        }
 673:	          if(curi.containsKey(A_FETCH_OVERDUE)){
 674:            curi.addAnnotation("ov:" +
 675:                    ArchiveUtils.formatMillisecondsToConventional(
 676:                    (curi.getLong(A_FETCH_OVERDUE))));
 677:        }
 678:        
 679:        Object array[] = { curi };
 680:        controller.uriProcessing.log(
 681:            Level.INFO,
 682:            curi.getUURI().toString(),
 683:            array);
 684:
 685:        succeededFetchCount++;
 686:        totalProcessedBytes += curi.getContentSize();
 687:
 688:        // Let everyone know in case they want to do something before we strip
 689:        // the curi.
 690:        controller.fireCrawledURISuccessfulEvent(curi);
 691:        
 692:        curi.setSchedulingDirective(CandidateURI.NORMAL);
 693:
 694:        // Set time of next processing
 695:        curi.putLong(A_TIME_OF_NEXT_PROCESSING,
 696:                System.currentTimeMillis()+waitInterval);
 697:        
 698:        
 699:        /* Update HQ */
 700:        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
 701:        
 702:        // Wake up time is based on the time when a fetch was completed + the
 703:        // calculated snooze time for politeness. If the fetch completion time
 704:        // is missing, we'll use current time.
 705:        long wakeupTime = (curi.containsKey(A_FETCH_COMPLETED_TIME)?
 706:                curi.getLong(A_FETCH_COMPLETED_TIME):
 707:                    (new Date()).getTime()) + calculateSnoozeTime(curi);
 708:        
 709:        // Ready the URI for reserialization.
 710:        curi.processingCleanup(); 
 711:        curi.resetDeferrals();   
 712:        curi.resetFetchAttempts();
 713:	          try {
 714:            hq.update(curi, true, wakeupTime);
 715:        } catch (IOException e) {
  716:            logger.severe("An IOException occurred when updating " + 
 717:                    curi.toString() + "\n" + e.getMessage());
 718:            e.printStackTrace();
 719:        }
 720:    }
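
        // Worked example (editorial, numbers hypothetical): if the fetch
        // completed at time t and calculateSnoozeTime() returns 2,000ms, the
        // host queue wakes at t + 2,000ms for politeness, while the URI itself
        // is not due again until now + the WaitEvaluator-assigned interval
        // stored under A_WAIT_INTERVAL.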
 721:
 722:    /**
 723:     * Put near top of relevant hostQueue (but behind anything recently
  724:     * scheduled 'high').
 725:     *
 726:     * @param curi CrawlURI to reschedule. Its time of next processing is not
 727:     *             modified.
 728:     * @param errorWait signals if there should be a wait before retrying.
 729:     * @throws AttributeNotFoundException
 730:     */
 731:    protected void reschedule(CrawlURI curi, boolean errorWait)
 732:	              throws AttributeNotFoundException {
  733:        long delay = 0;
  734:        if(errorWait){
  735:            if(curi.containsKey(A_RETRY_DELAY)) {
  736:                delay = curi.getLong(A_RETRY_DELAY);
  737:            } else { // Use ARFrontier default (in seconds).
  738:                delay = ((Long)getAttribute(ATTR_RETRY_DELAY,curi)).longValue();
  739:            }
  740:            delay = delay * 1000; // Delay is in seconds; convert to ms.
  741:        }
 742:        
 743:        long retryTime = (curi.containsKey(A_FETCH_COMPLETED_TIME)?
 744:                curi.getLong(A_FETCH_COMPLETED_TIME):
 745:                    (new Date()).getTime()) + delay;
 746:        
 747:        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
 748:        // Ready the URI for reserialization.
 749:        curi.processingCleanup(); 
 750:	          if(errorWait){
  751:            curi.resetDeferrals(); // Deferrals only refer to immediate retries.
 752:        }
 753:	          try {
 754:            hq.update(curi, errorWait, retryTime);
 755:        } catch (IOException e) {
 756:            // TODO Handle IOException
 757:            e.printStackTrace();
 758:        }
 759:    }
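
        // Worked example (editorial, numbers hypothetical): with no per-URI
        // A_RETRY_DELAY, the retry-delay-seconds default of 900 yields a
        // 900,000ms hold-off, so the URI becomes eligible again roughly 15
        // minutes after the failed fetch completed.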
 760:
 761:    /**
 762:     * The CrawlURI has encountered a problem, and will not
 763:     * be retried.
 764:     *
 765:     * @param curi The CrawlURI
 766:     */
 767:	      protected void failureDisposition(CrawlURI curi) {
 768:        //Let interested listeners know of failed disposition.
 769:        this.controller.fireCrawledURIFailureEvent(curi);
 770:
 771:        // send to basic log
 772:        curi.aboutToLog();
 773:        Object array[] = { curi };
 774:        this.controller.uriProcessing.log(
 775:            Level.INFO,
 776:            curi.getUURI().toString(),
 777:            array);
 778:
 779:        // if exception, also send to crawlErrors
 780:	          if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
 781:            this.controller.runtimeErrors.log(
 782:                Level.WARNING,
 783:                curi.getUURI().toString(),
 784:                array);
 785:        }
 786:        failedFetchCount++;
 787:        
 788:        // Put the failed URI at the very back of the queue.
 789:        curi.setSchedulingDirective(CandidateURI.NORMAL);
 790:        // TODO: reconsider this
 791:        curi.putLong(A_TIME_OF_NEXT_PROCESSING,Long.MAX_VALUE);
 792:
 793:        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
 794:        // Ready the URI for serialization.
 795:        curi.processingCleanup();
 796:        curi.resetDeferrals();
 797:        curi.resetFetchAttempts();
 798:	          try {
 799:            // No wait on failure. No contact was made with the server.
 800:            boolean shouldForget = shouldBeForgotten(curi);
 801:	              if(shouldForget && alreadyIncluded != null){
 802:                alreadyIncluded.forget(canonicalize(curi.getUURI()),curi);
 803:            }
 804:            hq.update(curi,false, 0, shouldForget); 
 805:        } catch (IOException e) {
 806:            // TODO Handle IOException
 807:            e.printStackTrace();
 808:        }
 809:    }
 810:
 811:	      protected void disregardDisposition(CrawlURI curi) {
 812:        //Let interested listeners know of disregard disposition.
 813:        controller.fireCrawledURIDisregardEvent(curi);
 814:
 815:        // send to basic log
 816:        curi.aboutToLog();
 817:        Object array[] = { curi };
 818:        controller.uriProcessing.log(
 819:            Level.INFO,
 820:            curi.getUURI().toString(),
 821:            array);
 822:
 823:        disregardedUriCount++;
 824:        
  825:        // TODO: consider a timeout before retrying disregarded elements.
 826:        //       Possibly add a setting to the WaitEvaluators?
 827:        curi.putLong(A_TIME_OF_NEXT_PROCESSING,Long.MAX_VALUE); 
 828:        curi.setSchedulingDirective(CandidateURI.NORMAL);
 829:
 830:        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
 831:        // Ready the URI for reserialization.
 832:        curi.processingCleanup(); 
 833:        curi.resetDeferrals();
 834:        curi.resetFetchAttempts();
 835:	          try {
  836:            // No politeness wait on disregard. No contact was made with the server.
 837:            hq.update(curi, false, 0, shouldBeForgotten(curi));
 838:        } catch (IOException e) {
 839:            // TODO Handle IOException
 840:            e.printStackTrace();
 841:        }
 842:    }
 843:
 844:    /**
 845:     * Some URIs, if they recur,  deserve another
 846:     * chance at consideration: they might not be too
 847:     * many hops away via another path, or the scope
 848:     * may have been updated to allow them passage.
 849:     *
 850:     * @param curi
 851:     * @return True if curi should be forgotten.
 852:     */
 853:	      protected boolean shouldBeForgotten(CrawlURI curi) {
 854:	          switch(curi.getFetchStatus()) {
 855:            case S_OUT_OF_SCOPE:
 856:            case S_TOO_MANY_EMBED_HOPS:
 857:            case S_TOO_MANY_LINK_HOPS:
 858:                return true;
 859:            default:
 860:                return false;
 861:        }
 862:    }
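
    // Illustrative note (editorial): forgetting an S_OUT_OF_SCOPE URI also
    // removes it from the alreadyIncluded filter (see failureDisposition), so
    // if the same URI is discovered again after a scope change it is evaluated
    // anew rather than dropped as a duplicate.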
 863:
 864:    /**
 865:     * Checks if a recently completed CrawlURI that did not finish successfully
 866:     * needs to be retried immediately (processed again as soon as politeness
  867:     * allows).
 868:     *
 869:     * @param curi The CrawlURI to check
 870:     * @return True if we need to retry promptly.
 871:     * @throws AttributeNotFoundException If problems occur trying to read the
 872:     *            maximum number of retries from the settings framework.
 873:     */
 874:    protected boolean needsPromptRetry(CrawlURI curi)
 875:	              throws AttributeNotFoundException {
 876:        if (curi.getFetchAttempts() >=
 877:	                  ((Integer)getAttribute(ATTR_MAX_RETRIES, curi)).intValue() ) {
 878:            return false;
 879:        }
 880:
 881:	          switch (curi.getFetchStatus()) {
 882:            case S_DEFERRED:
 883:                return true;
 884:
 885:            case HttpStatus.SC_UNAUTHORIZED:
 886:                // We can get here though usually a positive status code is
 887:                // a success.  We get here if there is rfc2617 credential data
  888:                // loaded and we're supposed to go around again.  See if any
  889:                // rfc2617 credential is present and, if so, assume it got
  890:                // loaded in FetchHTTP on the expectation that we're to go
  891:                // around again.  If none was loaded, we should not be here.
 892:                boolean loaded = curi.hasRfc2617CredentialAvatar();
 893:	                  if (!loaded) {
 894:                    logger.severe("Have 401 but no creds loaded " + curi);
 895:                }
 896:                return loaded;
 897:
 898:            default:
 899:                return false;
 900:        }
 901:    }
 902:
 903:    /**
 904:     * Checks if a recently completed CrawlURI that did not finish successfully
 905:     * needs to be retried (processed again after some time elapses)
 906:     *
 907:     * @param curi The CrawlURI to check
 908:     * @return True if we need to retry.
 909:     * @throws AttributeNotFoundException If problems occur trying to read the
 910:     *            maximum number of retries from the settings framework.
 911:     */
 912:    protected boolean needsRetrying(CrawlURI curi)
 913:	              throws AttributeNotFoundException {
 914:        // Check to see if maximum number of retries has been exceeded.
 915:        if (curi.getFetchAttempts() >= 
 916:	              ((Integer)getAttribute(ATTR_MAX_RETRIES,curi)).intValue() ) {
 917:            return false;
 918:        } else {
 919:            // Check if FetchStatus indicates that a delayed retry is needed.
 920:	              switch (curi.getFetchStatus()) {
 921:                case S_CONNECT_FAILED:
 922:                case S_CONNECT_LOST:
 923:                case S_DOMAIN_UNRESOLVABLE:
 924:                    // these are all worth a retry
 925:                    // TODO: consider if any others (S_TIMEOUT in some cases?) 
 926:                    //       deserve retry
 927:                    return true;
 928:                default:
 929:                    return false;
 930:            }
 931:        }
 932:    }
 933:    
 934:	      protected boolean isDisregarded(CrawlURI curi) {
 935:	          switch (curi.getFetchStatus()) {
 936:            case S_ROBOTS_PRECLUDED :     // they don't want us to have it
 937:            case S_OUT_OF_SCOPE :         // filtered out by scope
 938:            case S_BLOCKED_BY_CUSTOM_PROCESSOR:
 939:            case S_BLOCKED_BY_USER :      // filtered out by user
 940:            case S_TOO_MANY_EMBED_HOPS :  // too far from last true link
 941:            case S_TOO_MANY_LINK_HOPS :   // too far from seeds
 942:            case S_DELETED_BY_USER :      // user deleted
 943:                return true;
 944:            default:
 945:                return false;
 946:        }
 947:    }
 948:    
 949:    /**
 950:     * Calculates how long a host queue needs to be snoozed following the
 951:     * crawling of a URI.
 952:     *
 953:     * @param curi The CrawlURI
 954:     * @return How long to snooze.
 955:     */
 956:	      protected long calculateSnoozeTime(CrawlURI curi) {
 957:        long durationToWait = 0;
 958:        if (curi.containsKey(A_FETCH_BEGAN_TIME)
 959:	              && curi.containsKey(A_FETCH_COMPLETED_TIME)) {
 960:            
 961:	              try{
 962:            
 963:                long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
 964:                long durationTaken = 
 965:                    (completeTime - curi.getLong(A_FETCH_BEGAN_TIME));
 966:                
 967:                durationToWait = (long)(
 968:                        ((Float) getAttribute(ATTR_DELAY_FACTOR, curi))
 969:                            .floatValue() * durationTaken);
 970:    
 971:                long minDelay = 
 972:                    ((Integer) getAttribute(ATTR_MIN_DELAY, curi)).longValue();
 973:                
 974:	                  if (minDelay > durationToWait) {
 975:                    // wait at least the minimum
 976:                    durationToWait = minDelay;
 977:                }
 978:    
 979:                long maxDelay = ((Integer) getAttribute(ATTR_MAX_DELAY, curi)).longValue();
 980:	                  if (durationToWait > maxDelay) {
 981:                    // wait no more than the maximum
 982:                    durationToWait = maxDelay;
 983:                }
 984:            } catch (AttributeNotFoundException e) {
 985:                logger.severe("Unable to find attribute. " + 
 986:                        curi.toString());
 987:                //Wait for max interval.
 988:                durationToWait = DEFAULT_MAX_DELAY.longValue();
 989:            }
 990:
 991:        }
 992:        long ret = durationToWait > DEFAULT_MIN_DELAY.longValue() ? 
 993:                durationToWait : DEFAULT_MIN_DELAY.longValue();
 994:        logger.finest("Snooze time for " + curi.toString() + " = " + ret );
 995:        return ret;
 996:    }
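
        // Worked example (editorial, numbers hypothetical): delay-factor 5.0
        // and a 400ms fetch give 2,000ms, which min-delay-ms (2,000) and
        // max-delay-ms (30,000) leave unchanged, so the host queue snoozes
        // 2 seconds. A 20s fetch would give 100s, which the max-delay-ms cap
        // trims to 30s.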
 997:
 998:    /* (non-Javadoc)
 999:     * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
1000:     */
1001:	      public synchronized long discoveredUriCount() {
1002:        return (this.alreadyIncluded != null) ? 
1003:                this.alreadyIncluded.count() : hostQueues.getSize();
1004:    }
1005:
1006:    /* (non-Javadoc)
1007:     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
1008:     */
1009:	      public synchronized long queuedUriCount() {
1010:        return hostQueues.getSize();
1011:    }
1012:
1013:    /* (non-Javadoc)
1014:     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
1015:     */
1016:	      public long finishedUriCount() {
1017:        return succeededFetchCount+failedFetchCount+disregardedUriCount;
1018:    }
1019:
1020:    /* (non-Javadoc)
1021:     * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
1022:     */
1023:	      public long succeededFetchCount() {
1024:        return succeededFetchCount;
1025:    }
1026:
1027:    /* (non-Javadoc)
1028:     * @see org.archive.crawler.framework.Frontier#failedFetchCount()
1029:     */
1030:	      public long failedFetchCount() {
1031:        return failedFetchCount;
1032:    }
1033:
1034:    /* (non-Javadoc)
1035:     * @see org.archive.crawler.framework.Frontier#disregardedUriCount()
1036:     */
1037:	      public long disregardedUriCount() {
 1038:        return disregardedUriCount;
1039:    }
1040:
1041:    /* (non-Javadoc)
1042:     * @see org.archive.crawler.framework.Frontier#totalBytesWritten()
1043:     */
1044:	      public long totalBytesWritten() {
1045:        return totalProcessedBytes;
1046:    }
1047:
1048:    /**
 1049:     * Method is not supported by this Frontier implementation.
1050:     * @param pathToLog
1051:     * @throws IOException
1052:     */
1053:	      public void importRecoverLog(String pathToLog) throws IOException {
1054:        throw new IOException("Unsupported by this frontier.");
1055:    }
1056:
1057:    public synchronized FrontierMarker getInitialMarker(String regexpr,
1058:	              boolean inCacheOnly) {
1059:        return null;
1060:    }
1061:
1062:    /* (non-Javadoc)
1063:     * @see org.archive.crawler.framework.Frontier#getURIsList(org.archive.crawler.framework.FrontierMarker, int, boolean)
1064:     */
1065:    public synchronized ArrayList getURIsList(FrontierMarker marker,
1066:            int numberOfMatches, boolean verbose)
1067:	          throws InvalidFrontierMarkerException {
1068:        // TODO Auto-generated method stub
1069:        return null;
1070:    }
1071:
1072:    /* (non-Javadoc)
1073:     * @see org.archive.crawler.framework.Frontier#deleteURIs(java.lang.String)
1074:     */
1075:	      public synchronized long deleteURIs(String match) {
1076:        // TODO Auto-generated method stub
1077:        return 0;
1078:    }
1079:
1080:    /* (non-Javadoc)
1081:     * @see org.archive.crawler.framework.Frontier#deleted(org.archive.crawler.datamodel.CrawlURI)
1082:     */
1083:	      public synchronized void deleted(CrawlURI curi) {
1084:        // TODO Auto-generated method stub
1085:    }
1086:
1087:	      public void considerIncluded(UURI u) {
1088:        // This will cause the URI to be crawled!!!
1089:        CrawlURI curi = new CrawlURI(u);
1090:        innerSchedule(curi);
1091:
1092:    }
1093:
1094:	      public void kickUpdate() {
1095:        loadSeeds();
1096:    }
1097:    
1098:	      public void start() {
1099:        unpause(); 
1100:    }
1101:    
1102:	      synchronized public void pause() { 
1103:        shouldPause = true;
1104:        notifyAll();
1105:    }
1106:	      synchronized public void unpause() { 
1107:        shouldPause = false;
1108:        notifyAll();
1109:    }
1110:	      synchronized public void terminate() { 
1111:        shouldTerminate = true;
1112:    }  
1113:
1114:    /* (non-Javadoc)
1115:     * @see org.archive.crawler.framework.Frontier#getFrontierJournal()
1116:     */
1117:	      public FrontierJournal getFrontierJournal() {
1118:        return null;
1119:    }
1120:
1121:    private static class ThreadLocalQueue
1122:	      extends ThreadLocal<Queue<CandidateURI>> implements Serializable {
1123:
1124:        private static final long serialVersionUID = 8268977225156462059L;
1125:
1126:	          protected Queue<CandidateURI> initialValue() {
1127:            return new MemQueue<CandidateURI>();
1128:        }
1129:
1130:        /**
1131:         * @return Queue of 'batched' items
1132:         */
1133:	          public Queue<CandidateURI> getQueue() {
1134:            return get();
1135:        }
1136:    }
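
    // Design note (editorial): the ThreadLocalQueue above gives each worker
    // thread a private MemQueue, so schedule() involves no shared lock; URIs
    // reach shared frontier state only when innerBatchFlush() drains the
    // calling thread's queue under the frontier's monitor.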
1137:    
1138:    /**
1139:     * This method is not supported by this Frontier implementation
1140:     * @param pathToLog
1141:     * @param retainFailures
1142:     * @throws IOException
1143:     */
1144:    public void importRecoverLog(String pathToLog, boolean retainFailures)
1145:	      throws IOException {
1146:        throw new IOException("Unsupported");
1147:    }
1148:
1149:    //
1150:    // Reporter implementation
1151:    //
1152:    
1153:	      public String[] getReports() {
1154:        // none but default for now
1155:        return new String[] {};
1156:    }
1157:    
1158:    /* (non-Javadoc)
1159:     * @see org.archive.util.Reporter#singleLineReport()
1160:     */
1161:	      public String singleLineReport() {
1162:        return ArchiveUtils.singleLineReport(this);
1163:    }
1164:
1165:    /* (non-Javadoc)
1166:     * @see org.archive.util.Reporter#reportTo(java.io.Writer)
1167:     */
1168:	      public void reportTo(PrintWriter writer) throws IOException {
1169:        reportTo(null,writer);
1170:    }
1171:    
1172:    /* (non-Javadoc)
1173:     * @see org.archive.crawler.framework.Frontier#oneLineReport()
1174:     */
1175:	      public synchronized void singleLineReportTo(PrintWriter w) throws IOException {
1176:        hostQueues.singleLineReportTo(w);
1177:    }
1178:
1179:    /* (non-Javadoc)
1180:     * @see org.archive.util.Reporter#singleLineLegend()
1181:     */
1182:	      public String singleLineLegend() {
1183:        return hostQueues.singleLineLegend();
1184:    }
1185:    
1186:    /* (non-Javadoc)
1187:     * @see org.archive.crawler.framework.Frontier#report()
1188:     */
1189:	      public synchronized void reportTo(String name, PrintWriter writer) {
1190:        // ignore name; only one report for now
1191:        hostQueues.reportTo(name, writer);
1192:    }
1193:
1194:    /* (non-Javadoc)
1195:     * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
1196:     */
1197:	      public void crawlStarted(String message) {
1198:        // Not interested
1199:    }
1200:
1201:    /* (non-Javadoc)
1202:     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnding(java.lang.String)
1203:     */
1204:	      public void crawlEnding(String sExitMessage) {
1205:        // Not interested
1206:    }
1207:
1208:    /* (non-Javadoc)
1209:     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
1210:     */
1211:	      public void crawlEnded(String sExitMessage) {
1212:        // Cleanup!
1213:	          if (this.alreadyIncluded != null) {
1214:            this.alreadyIncluded.close();
1215:            this.alreadyIncluded = null;
1216:        }
1217:        hostQueues.close();
1218:    }
1219:
1220:    /* (non-Javadoc)
1221:     * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
1222:     */
1223:	      public void crawlPausing(String statusMessage) {
1224:        // Not interested
1225:    }
1226:
1227:    /* (non-Javadoc)
1228:     * @see org.archive.crawler.event.CrawlStatusListener#crawlPaused(java.lang.String)
1229:     */
1230:	      public void crawlPaused(String statusMessage) {
1231:        // Not interested
1232:    }
1233:
1234:    /* (non-Javadoc)
1235:     * @see org.archive.crawler.event.CrawlStatusListener#crawlResuming(java.lang.String)
1236:     */
1237:	      public void crawlResuming(String statusMessage) {
1238:        // Not interested
1239:    }
1240:
1241:    /* (non-Javadoc)
1242:     * @see org.archive.crawler.event.CrawlStatusListener#crawlCheckpoint(java.io.File)
1243:     */
1244:	      public void crawlCheckpoint(File checkpointDir) throws Exception {
1245:        // Not interested
1246:    }
1247:
1248:    /* (non-Javadoc)
1249:     * @see org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver#receive(org.archive.crawler.datamodel.CandidateURI)
1250:     */
1251:	      public void receive(CandidateURI item) {
 1252:        logger.finest("Received " + item);
1253:        innerSchedule(item);        
1254:    }
1255:
1256:    /* (non-Javadoc)
1257:     * @see org.archive.crawler.framework.Frontier#getGroup(org.archive.crawler.datamodel.CrawlURI)
1258:     */
1259:	      public FrontierGroup getGroup(CrawlURI curi) {
1260:	          try {
1261:            return getHQ(curi);
1262:        } catch (IOException ioe) {
1263:            throw new RuntimeException(ioe);
1264:        }
1265:    }
1266:    
1267:	      public long averageDepth() {
1268:        return hostQueues.getAverageDepth();
1269:    }
1270:    
1271:	      public float congestionRatio() {
1272:        return hostQueues.getCongestionRatio();
1273:    }
1274:    
1275:	      public long deepestUri() {
1276:        return hostQueues.getDeepestQueueSize();
1277:    }
1278:}