Java Source Code: org.archive.crawler.fetcher.FetchHTTP


   1: /* FetchHTTP.java
   2:  *
   3:  * $Id: FetchHTTP.java,v 1.113.2.1 2007/01/13 01:31:17 stack-sf Exp $
   4:  *
   5:  * Created on Jun 5, 2003
   6:  *
   7:  * Copyright (C) 2003 Internet Archive.
   8:  *
   9:  * This file is part of the Heritrix web crawler (crawler.archive.org).
  10:  *
  11:  * Heritrix is free software; you can redistribute it and/or modify
  12:  * it under the terms of the GNU Lesser Public License as published by
  13:  * the Free Software Foundation; either version 2.1 of the License, or
  14:  * any later version.
  15:  *
  16:  * Heritrix is distributed in the hope that it will be useful,
  17:  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19:  * GNU Lesser Public License for more details.
  20:  *
  21:  * You should have received a copy of the GNU Lesser Public License
  22:  * along with Heritrix; if not, write to the Free Software
  23:  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  24:  */
  25: package org.archive.crawler.fetcher;
  26: 
  27: import it.unimi.dsi.mg4j.util.MutableString;
  28: 
  29: import java.io.File;
  30: import java.io.FileNotFoundException;
  31: import java.io.FileOutputStream;
  32: import java.io.IOException;
  33: import java.io.ObjectInputStream;
  34: import java.io.ObjectOutputStream;
  35: import java.io.RandomAccessFile;
  36: import java.security.KeyManagementException;
  37: import java.security.KeyStoreException;
  38: import java.security.NoSuchAlgorithmException;
  39: import java.util.Collection;
  40: import java.util.HashSet;
  41: import java.util.Iterator;
  42: import java.util.List;
  43: import java.util.ListIterator;
  44: import java.util.Map;
  45: import java.util.Set;
  46: import java.util.logging.Level;
  47: import java.util.logging.Logger;
  48: import java.net.InetAddress;
  49: import java.net.UnknownHostException;
  50: 
  51: import javax.management.AttributeNotFoundException;
  52: import javax.management.MBeanException;
  53: import javax.management.ReflectionException;
  54: import javax.net.ssl.SSLContext;
  55: import javax.net.ssl.SSLSocketFactory;
  56: import javax.net.ssl.TrustManager;
  57: 
  58: import org.apache.commons.httpclient.Cookie;
  59: import org.apache.commons.httpclient.Header;
  60: import org.apache.commons.httpclient.HostConfiguration;
  61: import org.apache.commons.httpclient.HttpClient;
  62: import org.apache.commons.httpclient.HttpConnection;
  63: import org.apache.commons.httpclient.HttpConnectionManager;
  64: import org.apache.commons.httpclient.HttpException;
  65: import org.apache.commons.httpclient.HttpMethod;
  66: import org.apache.commons.httpclient.HttpMethodBase;
  67: import org.apache.commons.httpclient.HttpState;
  68: import org.apache.commons.httpclient.HttpStatus;
  69: import org.apache.commons.httpclient.HttpVersion;
  70: import org.apache.commons.httpclient.auth.AuthChallengeParser;
  71: import org.apache.commons.httpclient.auth.AuthScheme;
  72: import org.apache.commons.httpclient.auth.BasicScheme;
  73: import org.apache.commons.httpclient.auth.DigestScheme;
  74: import org.apache.commons.httpclient.auth.MalformedChallengeException;
  75: import org.apache.commons.httpclient.cookie.CookiePolicy;
  76: import org.apache.commons.httpclient.params.HttpClientParams;
  77: import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
  78: import org.apache.commons.httpclient.params.HttpMethodParams;
  79: import org.apache.commons.httpclient.protocol.Protocol;
  80: import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
  81: import org.archive.crawler.Heritrix;
  82: import org.archive.crawler.datamodel.CoreAttributeConstants;
  83: import org.archive.crawler.datamodel.CrawlHost;
  84: import org.archive.crawler.datamodel.CrawlOrder;
  85: import org.archive.crawler.datamodel.CrawlServer;
  86: import org.archive.crawler.datamodel.CrawlURI;
  87: import org.archive.crawler.datamodel.CredentialStore;
  88: import org.archive.crawler.datamodel.FetchStatusCodes;
  89: import org.archive.crawler.datamodel.ServerCache;
  90: import org.archive.crawler.datamodel.credential.Credential;
  91: import org.archive.crawler.datamodel.credential.CredentialAvatar;
  92: import org.archive.crawler.datamodel.credential.Rfc2617Credential;
  93: import org.archive.crawler.event.CrawlStatusListener;
  94: import org.archive.crawler.framework.Filter;
  95: import org.archive.crawler.framework.Processor;
  96: import org.archive.crawler.settings.MapType;
  97: import org.archive.crawler.settings.SettingsHandler;
  98: import org.archive.crawler.settings.SimpleType;
  99: import org.archive.crawler.settings.StringList;
 100: import org.archive.crawler.settings.Type;
 101: import org.archive.httpclient.ConfigurableX509TrustManager;
 102: import org.archive.httpclient.HttpRecorderGetMethod;
 103: import org.archive.httpclient.HttpRecorderMethod;
 104: import org.archive.httpclient.HttpRecorderPostMethod;
 105: import org.archive.httpclient.SingleHttpConnectionManager;
 106: import org.archive.io.ObjectPlusFilesInputStream;
 107: import org.archive.io.RecorderLengthExceededException;
 108: import org.archive.io.RecorderTimeoutException;
 109: import org.archive.io.RecorderTooMuchHeaderException;
 110: import org.archive.util.ArchiveUtils;
 111: import org.archive.util.HttpRecorder;
 112: 
 113: import com.sleepycat.bind.serial.SerialBinding;
 114: import com.sleepycat.bind.serial.StoredClassCatalog;
 115: import com.sleepycat.bind.tuple.StringBinding;
 116: import com.sleepycat.collections.StoredSortedMap;
 117: import com.sleepycat.je.Database;
 118: import com.sleepycat.je.DatabaseConfig;
 119: import com.sleepycat.je.DatabaseException;
 120: import com.sleepycat.je.Environment;
 121: 
 122: /**
 123:  * HTTP fetcher that uses <a
 124:  * href="http://jakarta.apache.org/commons/httpclient/">Apache Jakarta Commons
 125:  * HttpClient</a> library.
 126:  *
 127:  * @author Gordon Mohr
 128:  * @author Igor Ranitovic
 129:  * @author others
 130:  * @version $Id: FetchHTTP.java,v 1.113.2.1 2007/01/13 01:31:17 stack-sf Exp $
 131:  */
 132: public class FetchHTTP extends Processor
  133:    implements CoreAttributeConstants, FetchStatusCodes, CrawlStatusListener {
 134:    // be robust against trivial implementation changes
 135:    private static final long serialVersionUID =
 136:        ArchiveUtils.classnameBasedUID(FetchHTTP.class,1);
 137:    
 138:    private static Logger logger = Logger.getLogger(FetchHTTP.class.getName());
 139:
 140:    public static final String ATTR_HTTP_PROXY_HOST = A_HTTP_PROXY_HOST;
 141:    public static final String ATTR_HTTP_PROXY_PORT = A_HTTP_PROXY_PORT;
 142:    public static final String ATTR_TIMEOUT_SECONDS = "timeout-seconds";
 143:    public static final String ATTR_SOTIMEOUT_MS = "sotimeout-ms";
 144:    public static final String ATTR_MAX_LENGTH_BYTES = "max-length-bytes";
 145:    public static final String ATTR_LOAD_COOKIES = "load-cookies-from-file";
 146:    public static final String ATTR_SAVE_COOKIES = "save-cookies-to-file";
 147:    public static final String ATTR_ACCEPT_HEADERS = "accept-headers";
 148:    public static final String ATTR_DEFAULT_ENCODING = "default-encoding";
 149:    public static final String ATTR_SHA1_CONTENT = "sha1-content";
 150:    public static final String ATTR_FETCH_BANDWIDTH_MAX = "fetch-bandwidth";
 151:   
 152:    /**
 153:     * SSL trust level setting attribute name.
 154:     */
 155:    public static final String ATTR_TRUST = "trust-level";
 156:    
 157:    private static Integer DEFAULT_TIMEOUT_SECONDS = new Integer(1200);
 158:    private static Integer DEFAULT_SOTIMEOUT_MS = new Integer(20000);
 159:    private static Long DEFAULT_MAX_LENGTH_BYTES = new Long(0);
 160:    private static Integer DEFAULT_FETCH_BANDWIDTH_MAX = 0;
 161:
 162:    /**
  163:     * This is the default value pre-1.4. It needs special handling, else it
  164:     * is treated as a negative number when doing math later in processing.
 165:     */
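          // (9223372036854775807L is Long.MAX_VALUE.)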
 166:    private static long OLD_DEFAULT_MAX_LENGTH_BYTES = 9223372036854775807L;
 167:
 168:    /**
 169:     * Default character encoding to use for pages that do not specify.
 170:     */
 171:    private static String DEFAULT_CONTENT_CHARSET = Heritrix.DEFAULT_ENCODING;
 172:
 173:    /**
 174:     * Default whether to perform on-the-fly SHA1 hashing of content-bodies.
 175:     */
 176:    static Boolean DEFAULT_SHA1_CONTENT = new Boolean(true);
 177:    public static final String SHA1 = "sha1";
 178:
 179:    private transient HttpClient http = null;
 180:
 181:    /**
 182:     * How many 'instant retries' of HttpRecoverableExceptions have occurred
 183:     * 
 184:     * Would like it to be 'long', but longs aren't atomic
 185:     */
 186:    private int recoveryRetries = 0;
 187:
 188:    /**
 189:     * Count of crawl uris handled.
 190:     * Would like to be 'long', but longs aren't atomic
 191:     */
 192:    private int curisHandled = 0;
 193:        
 194:    /**
 195:     * Filters to apply mid-fetch, just after receipt of the response
 196:     * headers before we start to download body.
 197:     */
 198:    public final static String MIDFETCH_ATTR_FILTERS = "midfetch-filters";
 199:
 200:    /**
 201:     * Instance of midfetchfilters.
 202:     */
 203:    private MapType midfetchfilters = null;
 204:    
 205:    /**
 206:     * What to log if midfetch abort.
 207:     */
 208:    private static final String MIDFETCH_ABORT_LOG = "midFetchAbort";
 209:    
 210:    public static final String ATTR_SEND_CONNECTION_CLOSE =
 211:        "send-connection-close";
 212:    private static final Header HEADER_SEND_CONNECTION_CLOSE =
 213:        new Header("Connection", "close");
 214:    public static final String ATTR_SEND_REFERER = "send-referer";
 215:    public static final String ATTR_SEND_RANGE = "send-range";
 216:    public static final String REFERER = "Referer";
 217:    public static final String RANGE = "Range";
 218:    public static final String RANGE_PREFIX = "bytes=0-";
 219:    public static final String HTTP_SCHEME = "http";
 220:    public static final String HTTPS_SCHEME = "https";
 221:    
 222:    public static final String ATTR_IGNORE_COOKIES = "ignore-cookies";
 223:    private static Boolean DEFAULT_IGNORE_COOKIES = new Boolean(false);
 224:
 225:    public static final String ATTR_BDB_COOKIES = "use-bdb-for-cookies";
 226:    private static Boolean DEFAULT_BDB_COOKIES = new Boolean(true);
 227:    
 228:    public static final String ATTR_LOCAL_ADDRESS = "bind-address";
 229:    
 230:    /**
 231:     * Database backing cookie map, if using BDB
 232:     */
 233:    protected Database cookieDb; 
 234:    /**
 235:     * Name of cookie BDB Database
 236:     */
 237:    public static final String COOKIEDB_NAME = "http_cookies";
 238:    
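          // Registering these protocols globally replaces HttpClient's
          // default plain and SSL socket factories; Heritrix's factories
          // pull the IP to use from the crawler's server cache (see
          // SERVER_CACHE_KEY below) instead of doing fresh DNS lookups.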
  239:    static {
 240:        Protocol.registerProtocol("http", new Protocol("http",
 241:            new HeritrixProtocolSocketFactory(), 80));
  242:        try {
 243:            Protocol.registerProtocol("https",
 244:                new Protocol("https", ((ProtocolSocketFactory)
 245:                    new HeritrixSSLProtocolSocketFactory()), 443));
 246:        } catch (KeyManagementException e) {
 247:            e.printStackTrace();
 248:        } catch (KeyStoreException e) {
 249:            e.printStackTrace();
 250:        } catch (NoSuchAlgorithmException e) {
 251:            e.printStackTrace();
 252:        }
 253:    }
 254:    static final String SERVER_CACHE_KEY = "heritrix.server.cache";
 255:    static final String SSL_FACTORY_KEY = "heritrix.ssl.factory";
 256:    
  257:    /**
 258:     * Socket factory that has the configurable trust manager installed.
 259:     */
 260:    private SSLSocketFactory sslfactory = null;
 261:    
 262:
 263:    /**
 264:     * Constructor.
 265:     *
 266:     * @param name Name of this processor.
 267:     */
  268:    public FetchHTTP(String name) {
 269:        super(name, "HTTP Fetcher");
 270:        this.midfetchfilters = (MapType) addElementToDefinition(
 271:            new MapType(MIDFETCH_ATTR_FILTERS, "Filters applied after" +
 272:                " receipt of HTTP response headers but before we start to" +
 273:                " download the body. If any filter returns" +
 274:                " FALSE, the fetch is aborted. Prerequisites such as" +
  275:                " robots.txt bypass filtering (i.e. they cannot be" +
  276:                " midfetch aborted).", Filter.class));
 277:// see [ 1379040 ] regex for midfetch filter not being stored in crawl order
 278:// http://sourceforge.net/support/tracker.php?aid=1379040
 279://        this.midfetchfilters.setExpertSetting(true);
 280:        
  281:        addElementToDefinition(new SimpleType(ATTR_TIMEOUT_SECONDS,
  282:            "If the fetch is not completed in this number of seconds,"
  283:            + " give up (and retry later). For optimal configuration," +
  284:            " ensure this value is > " + ATTR_SOTIMEOUT_MS + ".",
  285:            DEFAULT_TIMEOUT_SECONDS));
  286:        Type e = addElementToDefinition(new SimpleType(ATTR_SOTIMEOUT_MS,
  287:            "If the socket is unresponsive for this number of milliseconds," +
  288:            " give up. Set to zero for no timeout (not" +
  289:            " recommended: it could hang a thread on an unresponsive server)." +
  290:            " This timeout is used for timing out socket opens" +
  291:            " and for timing out each socket read. Make sure this" +
  292:            " value is < " + ATTR_TIMEOUT_SECONDS + " for optimal" +
  293:            " configuration: that ensures at least one retry read.",
  294:                DEFAULT_SOTIMEOUT_MS));
 295:        e.setExpertSetting(true);
 296:        e = addElementToDefinition(new SimpleType(ATTR_FETCH_BANDWIDTH_MAX,
 297:            "The maximum KB/sec to use when fetching data from a server. " +
 298:            "0 means no maximum.  Default: "+ DEFAULT_FETCH_BANDWIDTH_MAX
 299:             + ".", DEFAULT_FETCH_BANDWIDTH_MAX));
 300:        e.setExpertSetting(true);
 301:        e.setOverrideable(true);
 302:        addElementToDefinition(new SimpleType(ATTR_MAX_LENGTH_BYTES,
 303:            "Maximum length in bytes to fetch.\n" +
 304:            "Fetch is truncated at this length. A value of 0 means no limit.",
 305:            DEFAULT_MAX_LENGTH_BYTES));
 306:        e = addElementToDefinition(new SimpleType(ATTR_IGNORE_COOKIES,
 307:            "Disable cookie-handling.", DEFAULT_IGNORE_COOKIES));
 308:        e.setOverrideable(true);
 309:        e.setExpertSetting(true);
 310:        e = addElementToDefinition(new SimpleType(ATTR_BDB_COOKIES,
 311:                "Store cookies in BDB-backed map.", DEFAULT_BDB_COOKIES));
 312:        e.setExpertSetting(true);
 313:
 314:        e = addElementToDefinition(new SimpleType(ATTR_LOAD_COOKIES,
  315:            "File to preload cookies from.", ""));
 316:        e.setExpertSetting(true);
 317:        e = addElementToDefinition(new SimpleType(ATTR_SAVE_COOKIES,
  318:            "When the crawl finishes, save cookies to this file.", ""));
 319:        e.setExpertSetting(true);
 320:        e = addElementToDefinition(new SimpleType(ATTR_TRUST,
 321:            "SSL certificate trust level.  Range is from the default 'open'"
 322:            + " (trust all certs including expired, selfsigned, and those for"
 323:            + " which we do not have a CA) through 'loose' (trust all valid"
 324:            + " certificates including selfsigned), 'normal' (all valid"
 325:            + " certificates not including selfsigned) to 'strict' (Cert is"
 326:            + " valid and DN must match servername)",
 327:            ConfigurableX509TrustManager.DEFAULT,
 328:            ConfigurableX509TrustManager.LEVELS_AS_ARRAY));
 329:        e.setOverrideable(false);
 330:        e.setExpertSetting(true);
 331:        e = addElementToDefinition(new StringList(ATTR_ACCEPT_HEADERS,
 332:            "Accept Headers to include in each request. Each must be the"
 333:            + " complete header, e.g., 'Accept-Language: en'"));
 334:        e.setExpertSetting(true);
 335:        e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST,
 336:            "Proxy host IP (set only if needed).", ""));
 337:        e.setExpertSetting(true);
 338:        e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT,
  339:            "Proxy port (set only if needed).", ""));
 340:        e.setExpertSetting(true);
 341:        e = addElementToDefinition(new SimpleType(ATTR_DEFAULT_ENCODING,
 342:            "The character encoding to use for files that do not have one" +
 343:            " specified in the HTTP response headers.  Default: " +
 344:            DEFAULT_CONTENT_CHARSET + ".",
 345:            DEFAULT_CONTENT_CHARSET));
 346:        e.setExpertSetting(true);
 347:        e = addElementToDefinition(new SimpleType(ATTR_SHA1_CONTENT,
  348:                "Whether or not to perform an on-the-fly SHA1 hash of" +
  349:                " retrieved content-bodies.",
 350:                DEFAULT_SHA1_CONTENT));
 351:        e.setExpertSetting(true);
 352:        e = addElementToDefinition(new SimpleType(ATTR_SEND_CONNECTION_CLOSE,
 353:            "Send 'Connection: close' header with every request.",
 354:             new Boolean(true)));
 355:        e.setOverrideable(true);
 356:        e.setExpertSetting(true);
 357:        e = addElementToDefinition(new SimpleType(ATTR_SEND_REFERER,
  358:             "Send 'Referer' header with every request.\n" +
  359:             "The 'Referer' header contains the location the crawler came" +
  360:             " from:" +
  361:             " the page in which the current URI was discovered. The" +
  362:             " 'Referer' is usually" +
  363:             " logged on the remote server and can be of assistance to" +
  364:             " webmasters trying to figure out how a crawler got to a" +
  365:             " particular area on a site.",
 366:             new Boolean(true)));
 367:        e.setOverrideable(true);
 368:        e.setExpertSetting(true);
  369:        e = addElementToDefinition(new SimpleType(ATTR_SEND_RANGE,
  370:              "Send 'Range' header when a limit (" + ATTR_MAX_LENGTH_BYTES +
  371:              ") on document size is in effect.\n" +
  372:              "Be polite to the HTTP servers and send the 'Range' header, " +
  373:              "stating that you are only interested in the first n bytes. " +
  374:              "Only pertinent if " + ATTR_MAX_LENGTH_BYTES + " > 0. " +
  375:              "Sending the 'Range' header results in a " +
  376:              "'206 Partial Content' status response, which is better than " +
  377:              "just cutting the response mid-download. On rare occasion, " +
  378:              "sending 'Range' will " +
  379:              "generate a '416 Request Range Not Satisfiable' response.",
  380:              new Boolean(false)));
  381:        e.setOverrideable(true);
  382:        e.setExpertSetting(true);
  383:        e = addElementToDefinition(new SimpleType(ATTR_LOCAL_ADDRESS,
  384:            "Local IP address or hostname to use when making connections " +
  385:            "(binding sockets). When not specified, uses default local " +
  386:            "address(es).", ""));
  387:        e.setExpertSetting(true);
 388:    }
 389:
 390:    protected void innerProcess(final CrawlURI curi)
  391:            throws InterruptedException {
  392:        if (!canFetch(curi)) {
 393:            // Cannot fetch this, due to protocol, retries, or other problems
 394:            return;
 395:        }
 396:
 397:        this.curisHandled++;
 398:
 399:        // Note begin time
 400:        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());
 401:
 402:        // Get a reference to the HttpRecorder that is set into this ToeThread.
 403:        HttpRecorder rec = HttpRecorder.getHttpRecorder();
 404:        
 405:        // Shall we get a digest on the content downloaded?
 406:        boolean sha1Content = ((Boolean)getUncheckedAttribute(curi,
 407:            ATTR_SHA1_CONTENT)).booleanValue();
  408:        if (sha1Content) {
 409:            rec.getRecordedInput().setSha1Digest();
 410:        } else {
 411:            // clear
 412:            rec.getRecordedInput().setDigest(null);
 413:        }
 414:        
 415:        // Below we do two inner classes that add check of midfetch
 416:        // filters just as we're about to receive the response body.
 417:        String curiString = curi.getUURI().toString();
 418:        HttpMethodBase method = null;
  419:        if (curi.isPost()) {
  420:            method = new HttpRecorderPostMethod(curiString, rec) {
 421:                protected void readResponseBody(HttpState state,
 422:                        HttpConnection conn)
  423:                throws IOException, HttpException {
 424:                    addResponseContent(this, curi);
  425:                    if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) {
 426:                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
 427:                    } else {
 428:                        super.readResponseBody(state, conn);
 429:                    }
 430:                }
 431:            };
 432:        } else {
  433:            method = new HttpRecorderGetMethod(curiString, rec) {
 434:                protected void readResponseBody(HttpState state,
 435:                        HttpConnection conn)
  436:                throws IOException, HttpException {
 437:                    addResponseContent(this, curi);
 438:                    if (checkMidfetchAbort(curi, this.httpRecorderMethod,
  439:                            conn)) {
 440:                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
 441:                    } else {
 442:                        super.readResponseBody(state, conn);
 443:                    }
 444:                }
 445:            };
 446:        }
 447:
 448:        HostConfiguration customConfigOrNull = configureMethod(curi, method);
 449:        
 450:        // Set httpRecorder into curi. Subsequent code both here and later
 451:        // in extractors expects to find the HttpRecorder in the CrawlURI.
 452:        curi.setHttpRecorder(rec);
 453:        
 454:        // Populate credentials. Set config so auth. is not automatic.
 455:        boolean addedCredentials = populateCredentials(curi, method);
 456:        method.setDoAuthentication(addedCredentials);
 457:        
  458:        try {
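            // A null HostConfiguration means "use the client's defaults";
            // configureProxy() only returns a custom config when this URI
            // needs a proxy setting different from the client's current one.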
 459:            this.http.executeMethod(customConfigOrNull, method);
 460:        } catch (RecorderTooMuchHeaderException ex) {
 461:            // when too much header material, abort like other truncations
 462:            doAbort(curi, method, HEADER_TRUNC);
 463:        } catch (IOException e) {
 464:            failedExecuteCleanup(method, curi, e);
 465:            return;
 466:        } catch (ArrayIndexOutOfBoundsException e) {
 467:            // For weird windows-only ArrayIndex exceptions in native
 468:            // code... see
 469:            // http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
 470:            // treating as if it were an IOException
 471:            failedExecuteCleanup(method, curi, e);
 472:            return;
 473:        }
 474:        
 475:        // set softMax on bytes to get (if implied by content-length) 
 476:        long softMax = method.getResponseContentLength();
 477:        
 478:        // set hardMax on bytes (if set by operator)
 479:        long hardMax = getMaxLength(curi);
 480:
  481:        // Get max fetch rate (bytes/ms). It comes in as KB/sec, which
  482:        // conveniently equals bytes/ms, so no conversion is needed.
 483:        int maxFetchRate = getMaxFetchRate(curi);
 484:
  485:        try {
  486:            if (!method.isAborted()) {
 487:                // Force read-to-end, so that any socket hangs occur here,
 488:                // not in later modules.
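                  // Arguments: soft limit from Content-Length, hard
                  // operator-set byte limit, overall timeout in ms, and the
                  // maximum fetch rate (KB/sec, i.e. bytes/ms).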
 489:                rec.getRecordedInput().readFullyOrUntil(softMax,
 490:                        hardMax, 1000 * getTimeout(curi), maxFetchRate);
 491:            }
 492:        } catch (RecorderTimeoutException ex) {
 493:            doAbort(curi, method, TIMER_TRUNC);
 494:        } catch (RecorderLengthExceededException ex) {
 495:            doAbort(curi, method, LENGTH_TRUNC);
 496:        } catch (IOException e) {
 497:            cleanup(curi, e, "readFully", S_CONNECT_LOST);
 498:            return;
 499:        } catch (ArrayIndexOutOfBoundsException e) {
 500:            // For weird windows-only ArrayIndex exceptions from native code
 501:            // see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
 502:            // treating as if it were an IOException
 503:            cleanup(curi, e, "readFully", S_CONNECT_LOST);
 504:            return;
 505:        } finally {
 506:            // ensure recording has stopped
 507:            rec.closeRecorders();
  508:            if (!method.isAborted()) {
 509:                method.releaseConnection();
 510:            }
 511:            // Note completion time
 512:            curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
 513:            // Set the response charset into the HttpRecord if available.
 514:            setCharacterEncoding(rec, method);
 515:            curi.setContentSize(rec.getRecordedInput().getSize());
 516:        }
 517:        
 518:        curi.setContentDigest(SHA1, rec.getRecordedInput().getDigestValue());
  519:        if (logger.isLoggable(Level.INFO)) {
 520:            logger.info((curi.isPost()? "POST": "GET") + " " +
 521:                curi.getUURI().toString() + " " + method.getStatusCode() +
 522:                " " + rec.getRecordedInput().getSize() + " " +
 523:                curi.getContentType());
 524:        }
 525:
  526:        if (curi.isSuccess() && addedCredentials) {
 527:            // Promote the credentials from the CrawlURI to the CrawlServer
 528:            // so they are available for all subsequent CrawlURIs on this
 529:            // server.
 530:            promoteCredentials(curi);
  531:            if (logger.isLoggable(Level.FINE)) {
 532:                // Print out the cookie.  Might help with the debugging.
 533:                Header setCookie = method.getResponseHeader("set-cookie");
  534:                if (setCookie != null) {
 535:                    logger.fine(setCookie.toString().trim());
 536:                }
 537:            }
 538:        } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) {
 539:            // 401 is not 'success'.
 540:            handle401(method, curi);
 541:        }
 542:        
  543:        if (rec.getRecordedInput().isOpen()) {
 544:            logger.severe(curi.toString() + " RIS still open. Should have" +
 545:                " been closed by method release: " +
 546:                Thread.currentThread().getName());
  547:            try {
 548:                rec.getRecordedInput().close();
 549:            } catch (IOException e) {
 550:                logger.log(Level.SEVERE,"second-chance RIS close failed",e);
 551:            }
 552:        }
 553:    }
 554:    
 555:    protected void doAbort(CrawlURI curi, HttpMethod method,
  556:            String annotation) {
 557:        curi.addAnnotation(annotation);
 558:        curi.getHttpRecorder().close();
 559:        method.abort();
 560:    }
 561:    
 562:    protected boolean checkMidfetchAbort(CrawlURI curi,
  563:            HttpRecorderMethod method, HttpConnection conn) {
  564:        if (curi.isPrerequisite() || filtersAccept(midfetchfilters, curi)) {
 565:            return false;
 566:        }
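          // A midfetch filter vetoed this fetch: remember where the response
          // body begins in the recording, then let the caller abort the method.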
 567:        method.markContentBegin(conn);
 568:        return true;
 569:    }
 570:    
 571:    /**
 572:     * This method populates <code>curi</code> with response status and
 573:     * content type.
 574:     * @param curi CrawlURI to populate.
 575:     * @param method Method to get response status and headers from.
 576:     */
  577:    protected void addResponseContent(HttpMethod method, CrawlURI curi) {
 578:        curi.setFetchStatus(method.getStatusCode());
 579:        Header ct = method.getResponseHeader("content-type");
 580:        curi.setContentType((ct == null)? null: ct.getValue());
 581:        // Save method into curi too.  Midfetch filters may want to leverage
 582:        // info in here.
 583:        curi.putObject(A_HTTP_TRANSACTION, method);
 584:    }
 585:
 586:    /**
 587:     * Set the character encoding based on the result headers or default.
 588:     *
 589:     * The HttpClient returns its own default encoding ("ISO-8859-1") if one
 590:     * isn't specified in the Content-Type response header. We give the user
 591:     * the option of overriding this, so we need to detect the case where the
 592:     * default is returned.
 593:     *
 594:     * Now, it may well be the case that the default returned by HttpClient
 595:     * and the default defined by the user are the same.
 596:     * 
 597:     * @param rec Recorder for this request.
 598:     * @param method Method used for the request.
 599:     */
 600:    private void setCharacterEncoding(final HttpRecorder rec,
  601:            final HttpMethod method) {
 602:        String encoding = null;
 603:
  604:        try {
 605:            encoding = ((HttpMethodBase) method).getResponseCharSet();
 606:            if (encoding == null ||
  607:                    encoding.equals(DEFAULT_CONTENT_CHARSET)) {
 608:                encoding = (String) getAttribute(ATTR_DEFAULT_ENCODING);
 609:            }
 610:        } catch (Exception e) {
  611:            logger.warning("Failed to get default encoding: " +
 612:                e.getLocalizedMessage());
 613:        }
 614:        rec.setCharacterEncoding(encoding);
 615:    }
 616:
 617:    /**
 618:     * Cleanup after a failed method execute.
 619:     * @param curi CrawlURI we failed on.
 620:     * @param method Method we failed on.
 621:     * @param exception Exception we failed with.
 622:     */
 623:    private void failedExecuteCleanup(final HttpMethod method,
  624:            final CrawlURI curi, final Exception exception) {
 625:        cleanup(curi, exception, "executeMethod", S_CONNECT_FAILED);
 626:        method.releaseConnection();
 627:    }
 628:    
 629:    /**
 630:     * Cleanup after a failed method execute.
 631:     * @param curi CrawlURI we failed on.
 632:     * @param exception Exception we failed with.
 633:     * @param message Message to log with failure.
 634:     * @param status Status to set on the fetch.
 635:     */
 636:    private void cleanup(final CrawlURI curi, final Exception exception,
  637:            final String message, final int status) {
 638:        curi.addLocalizedError(this.getName(), exception, message);
 639:        curi.setFetchStatus(status);
 640:        curi.getHttpRecorder().close();
 641:    }
 642:
 643:    /**
 644:     * Can this processor fetch the given CrawlURI. May set a fetch
 645:     * status if this processor would usually handle the CrawlURI,
 646:     * but cannot in this instance.
 647:     *
 648:     * @param curi
 649:     * @return True if processor can fetch.
 650:     */
  651:    private boolean canFetch(CrawlURI curi) {
  652:        if (curi.getFetchStatus() < 0) {
  653:            // Already marked as errored; on this pass through,
  654:            // skip to the end of the processing chain.
 655:            curi.skipToProcessorChain(getController().getPostprocessorChain());
 656:            return false;             
 657:        }
 658:        String scheme = curi.getUURI().getScheme();
  659:        if (!(scheme.equals("http") || scheme.equals("https"))) {
  660:            // handles only plain http and https
  661:            return false;
  662:        }
  663:        CrawlHost host = getController().getServerCache().getHostFor(curi);
  664:        // make sure the dns lookup succeeded
  665:        if (host.getIP() == null && host.hasBeenLookedUp()) {
  666:            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
  667:            return false;
  668:        }
 669:        return true;
 670:    }
 671:
 672:    /**
 673:     * Configure the HttpMethod setting options and headers.
 674:     *
 675:     * @param curi CrawlURI from which we pull configuration.
 676:     * @param method The Method to configure.
 677:     */
  678:    protected HostConfiguration configureMethod(CrawlURI curi, HttpMethod method) {
 679:        // Don't auto-follow redirects
 680:        method.setFollowRedirects(false);
 681:        
 682://        // set soTimeout
 683://        method.getParams().setSoTimeout(
 684://                ((Integer) getUncheckedAttribute(curi, ATTR_SOTIMEOUT_MS))
 685://                        .intValue());
 686:        
 687:        // Set cookie policy.
 688:        method.getParams().setCookiePolicy(
 689:            (((Boolean)getUncheckedAttribute(curi, ATTR_IGNORE_COOKIES)).
 690:                booleanValue())?
 691:                    CookiePolicy.IGNORE_COOKIES:
 692:                CookiePolicy.BROWSER_COMPATIBILITY);
 693:
 694:        // Use only HTTP/1.0 (to avoid receiving chunked responses)
 695:        method.getParams().setVersion(HttpVersion.HTTP_1_0);
 696:
 697:        CrawlOrder order = getSettingsHandler().getOrder();
 698:        String userAgent = curi.getUserAgent();
  699:        if (userAgent == null) {
 700:            userAgent = order.getUserAgent(curi);
 701:        }
 702:        method.setRequestHeader("User-Agent", userAgent);
 703:        method.setRequestHeader("From", order.getFrom(curi));
 704:        
 705:        // Set retry handler.
 706:        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
 707:            new HeritrixHttpMethodRetryHandler());
 708:        
 709:        final long maxLength = getMaxLength(curi);
 710:        if(maxLength > 0 &&
 711:                ((Boolean)getUncheckedAttribute(curi, ATTR_SEND_RANGE)).
  712:                    booleanValue()) {
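            // Yields "Range: bytes=0-(maxLength - 1)", i.e. a request for
            // exactly maxLength bytes.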
 713:            method.addRequestHeader(RANGE,
 714:                RANGE_PREFIX.concat(Long.toString(maxLength - 1)));
 715:        }
 716:        
 717:        if (((Boolean)getUncheckedAttribute(curi,
  718:                ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) {
 719:            method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE);
 720:        }
 721:        
 722:        if (((Boolean)getUncheckedAttribute(curi,
  723:                ATTR_SEND_REFERER)).booleanValue()) {
 724:            // RFC2616 says no referer header if referer is https and the url
 725:            // is not
 726:            String via = curi.flattenVia();
 727:            if (via != null && via.length() > 0 &&
 728:                !(via.startsWith(HTTPS_SCHEME) &&
  729:                    curi.getUURI().getScheme().equals(HTTP_SCHEME))) {
 730:                method.setRequestHeader(REFERER, via);
 731:            }
 732:        }
 733:        
 734:        // TODO: What happens if below method adds a header already
 735:        // added above: e.g. Connection, Range, or Referer?
 736:        setAcceptHeaders(curi, method);
 737:        
 738:        return configureProxy(curi);
 739:    }
 740:
 741:    /**
 742:     * Setup proxy, based on attributes in CrawlURI and settings, 
 743:     * for this CrawlURI only. 
 744:     * @return HostConfiguration customized as necessary, or null if no
 745:     * customization required
 746:     */
  747:    private HostConfiguration configureProxy(CrawlURI curi) {
 748:        String proxy = (String) getAttributeEither(curi, ATTR_HTTP_PROXY_HOST);
 749:        int port = -1; 
  750:        if (proxy.length() == 0) {
 751:            proxy = null; 
 752:        } else {
 753:            String portString = (String)getAttributeEither(curi, ATTR_HTTP_PROXY_PORT);
 754:            port = portString.length()>0 ? Integer.parseInt(portString) : -1; 
 755:        }
 756:        HostConfiguration config = this.http.getHostConfiguration();
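          // The reference-equality test below intentionally catches the
          // common both-null case (no proxy anywhere) cheaply; the .equals()
          // test that follows catches a matching non-null proxy host.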
  757:        if (config.getProxyHost() == proxy && config.getProxyPort() == port) {
 758:            // no change
 759:            return null; 
 760:        }
 761:        if (proxy != null && proxy.equals(config.getProxyHost()) 
  762:                && config.getProxyPort() == port) {
 763:            // no change
 764:            return null; 
 765:        }
 766:        config = new HostConfiguration(config); // copy of config
 767:        config.setProxy(proxy,port);
 768:        return config; 
 769:    }
 770:
 771:    /**
 772:     * Get a value either from inside the CrawlURI instance, or from 
 773:     * settings (module attributes). 
 774:     * 
 775:     * @param curi CrawlURI to consult
 776:     * @param key key to lookup
 777:     * @return value from either CrawlURI (preferred) or settings
 778:     */
  779:    protected Object getAttributeEither(CrawlURI curi, String key) {
 780:        Object obj = curi!=null ? curi.getObject(key) : null;
  781:        if (obj == null) {
 782:            obj = getUncheckedAttribute(curi, key);
 783:        }
 784:        return obj;
 785:    }
 786:
 787:    /**
 788:     * Add credentials if any to passed <code>method</code>.
 789:     *
 790:     * Do credential handling.  Credentials are in two places.  1. Credentials
  791:     * that succeeded are added to the CrawlServer (or rather, avatars for
  792:     * the credentials are what is added, because it is not safe to keep around
 793:     * references to credentials).  2. Credentials to be tried are in the curi.
 794:     * Returns true if found credentials to be tried.
 795:     *
 796:     * @param curi Current CrawlURI.
 797:     * @param method The method to add to.
  798:     * @return True if we prepopulated <code>method</code> with credentials AND
  799:     * the credentials came from the <code>curi</code>, not from the CrawlServer.
  800:     * The former is special in that if the <code>curi</code> credentials
 801:     * succeed, then the caller needs to promote them from the CrawlURI to the
 802:     * CrawlServer so they are available for all subsequent CrawlURIs on this
 803:     * server.
 804:     */
  805:    private boolean populateCredentials(CrawlURI curi, HttpMethod method) {
 806:        // First look at the server avatars. Add any that are to be volunteered
  807:        // on every request (e.g. RFC2617 credentials).  'Every time' creds
  808:        // return true when we call isEveryTime().
 809:        CrawlServer server =
 810:            getController().getServerCache().getServerFor(curi);
  811:        if (server.hasCredentialAvatars()) {
 812:            Set avatars = server.getCredentialAvatars();
  813:            for (Iterator i = avatars.iterator(); i.hasNext();) {
 814:                CredentialAvatar ca = (CredentialAvatar)i.next();
 815:                Credential c = ca.getCredential(getSettingsHandler(), curi);
  816:                if (c.isEveryTime()) {
 817:                    c.populate(curi, this.http, method, ca.getPayload());
 818:                }
 819:            }
 820:        }
 821:
 822:        boolean result = false;
 823:
  824:        // Now look in the curi.  The curi will have credentials loaded
  825:        // either by the handle401 method (if it's rfc2617) or set into the
  826:        // curi by the PreconditionEnforcer as this login URI came through.
  827:        if (curi.hasCredentialAvatars()) {
 828:            Set avatars = curi.getCredentialAvatars();
  829:            for (Iterator i = avatars.iterator(); i.hasNext();) {
 830:                CredentialAvatar ca = (CredentialAvatar)i.next();
 831:                Credential c = ca.getCredential(getSettingsHandler(), curi);
  832:                if (c.populate(curi, this.http, method, ca.getPayload())) {
 833:                    result = true;
 834:                }
 835:            }
 836:        }
 837:
 838:        return result;
 839:    }
 840:
 841:    /**
 842:     * Promote successful credential to the server.
 843:     *
 844:     * @param curi CrawlURI whose credentials we are to promote.
 845:     */
  846:    private void promoteCredentials(final CrawlURI curi) {
  847:        if (!curi.hasCredentialAvatars()) {
 848:            logger.severe("No credentials to promote when there should be " +
 849:                curi);
 850:        } else {
 851:            Set avatars = curi.getCredentialAvatars();
  852:            for (Iterator i = avatars.iterator(); i.hasNext();) {
 853:                CredentialAvatar ca = (CredentialAvatar)i.next();
 854:                curi.removeCredentialAvatar(ca);
  855:                // The server to attach to may not be the server that hosts
 856:                // this passed curi.  It might be of another subdomain.
 857:                // The avatar needs to be added to the server that is dependent
 858:                // on this precondition.  Find it by name.  Get the name from
 859:                // the credential this avatar represents.
 860:                Credential c = ca.getCredential(getSettingsHandler(), curi);
 861:                String cd = null;
  862:                try {
 863:                    cd = c.getCredentialDomain(curi);
 864:                }
  865:                catch (AttributeNotFoundException e) {
 866:                    logger.severe("Failed to get cred domain for " + curi +
 867:                        " for " + ca + ": " + e.getMessage());
 868:                }
  869:                if (cd != null) {
 870:                    CrawlServer cs
 871:                        = getController().getServerCache().getServerFor(cd);
  872:                    if (cs != null) {
 873:                        cs.addCredentialAvatar(ca);
 874:                    }
 875:                }
 876:            }
 877:        }
 878:    }
 879:
 880:    /**
 881:     * Server is looking for basic/digest auth credentials (RFC2617). If we have
 882:     * any, put them into the CrawlURI and have it come around again. Presence
 883:     * of the credential serves as flag to frontier to requeue promptly. If we
 884:     * already tried this domain and still got a 401, then our credentials are
 885:     * bad. Remove them and let this curi die.
 886:     *
 887:     * @param method Method that got a 401.
 888:     * @param curi CrawlURI that got a 401.
 889:     */
  890:    protected void handle401(final HttpMethod method, final CrawlURI curi) {
 891:        AuthScheme authscheme = getAuthScheme(method, curi);
  892:        if (authscheme == null) {
 893:            return;
 894:        }
 895:        String realm = authscheme.getRealm();
 896:        
 897:        // Look to see if this curi had rfc2617 avatars loaded.  If so, are
  898:        // any of them for this realm?  If so, then the credential failed
  899:        // (we got a 401 despite supplying it); let the curi die a natural 401 death.
 900:        Set curiRfc2617Credentials = getCredentials(getSettingsHandler(),
 901:                curi, Rfc2617Credential.class);
 902:        Rfc2617Credential extant = Rfc2617Credential.
 903:            getByRealm(curiRfc2617Credentials, realm, curi);
  904:        if (extant != null) {
 905:            // Then, already tried this credential.  Remove ANY rfc2617
 906:            // credential since presence of a rfc2617 credential serves
 907:            // as flag to frontier to requeue this curi and let the curi
 908:            // die a natural death.
 909:            extant.detachAll(curi);
 910:            logger.warning("Auth failed (401) though supplied realm " +
 911:                    realm + " to " + curi.toString());
 912:        } else {
  913:            // Look to see if we have a credential that corresponds to this
  914:            // realm in the credential store.  Filter by type and credential
  915:            // domain.  If not, let this curi die. Else, add it to the
  916:            // curi and let it come around again. Add in the AuthScheme
  917:            // we got too.  It's needed when we go to run the auth the
  918:            // second time around.
 919:            CredentialStore cs =
 920:                CredentialStore.getCredentialStore(getSettingsHandler());
  921:            if (cs == null) {
 922:                logger.severe("No credential store for " + curi);
 923:            } else {
 924:                CrawlServer server = getController().getServerCache().
 925:                    getServerFor(curi);
 926:                Set storeRfc2617Credentials = cs.subset(curi,
 927:                    Rfc2617Credential.class, server.getName());
 928:                if (storeRfc2617Credentials == null ||
  929:                        storeRfc2617Credentials.size() <= 0) {
 930:                    logger.info("No rfc2617 credentials for " + curi);
 931:                } else {
 932:                    Rfc2617Credential found = Rfc2617Credential.
 933:                        getByRealm(storeRfc2617Credentials, realm, curi);
  934:                    if (found == null) {
 935:                        logger.info("No rfc2617 credentials for realm " +
 936:                                realm + " in " + curi);
 937:                    } else {
 938:                        found.attach(curi, authscheme.getRealm());
 939:                        logger.info("Found credential for realm " + realm +
 940:                            " in store for " + curi.toString());
 941:                    }
 942:                }
 943:            }
 944:        }
 945:    }
 946:    
 947:    /**
 948:     * @param method Method that got a 401.
 949:     * @param curi CrawlURI that got a 401.
  950:     * @return The first wholesome authscheme found, else null.
 951:     */
 952:    protected AuthScheme getAuthScheme(final HttpMethod method,
  953:            final CrawlURI curi) {
 954:        Header [] headers = method.getResponseHeaders("WWW-Authenticate");
  955:        if (headers == null || headers.length <= 0) {
 956:            logger.info("We got a 401 but no WWW-Authenticate challenge: " +
 957:                curi.toString());
 958:            return null;
 959:        }
 960:
 961:        Map authschemes = null;
  962:        try {
 963:            authschemes = AuthChallengeParser.parseChallenges(headers);
 964:        } catch(MalformedChallengeException e) {
 965:            logger.info("Failed challenge parse: " + e.getMessage());
 966:        }
  967:        if (authschemes == null || authschemes.size() <= 0) {
 968:            logger.info("We got a 401 and WWW-Authenticate challenge" +
 969:                " but failed parse of the header " + curi.toString());
 970:            return null;
 971:        }            
 972:         
 973:        AuthScheme result = null;
 974:        // Use the first auth found.
 975:        for (Iterator i = authschemes.keySet().iterator();
  976:                result == null && i.hasNext();) {
 977:            String key = (String)i.next();
 978:            String challenge = (String)authschemes.get(key);
  979:            if (key == null || key.length() <= 0 || challenge == null ||
  980:                    challenge.length() <= 0) {
  981:                logger.warning("Empty scheme: " + curi.toString() +
  982:                  ": " + headers);
                      // Skip the malformed entry rather than risk a
                      // NullPointerException in the key.equals() tests below.
                      continue;
  983:            }
 984:            AuthScheme authscheme = null;
  985:            if (key.equals("basic")) {
 986:                authscheme = new BasicScheme();
 987:            } else if (key.equals("digest")) {
 988:                authscheme = new DigestScheme();
 989:            } else {
 990:                logger.info("Unsupported scheme: " + key);
 991:                continue;
 992:            }
 993:            
  994:            try {
 995:                authscheme.processChallenge(challenge);
 996:            } catch (MalformedChallengeException e) {
 997:                logger.info(e.getMessage() + " " + curi + " " + headers);
 998:                continue;
 999:            }
 1000:            if (authscheme.isConnectionBased()) {
1001:                logger.info("Connection based " + authscheme);
1002:                continue;
1003:            }
1004:            
1005:            if (authscheme.getRealm() == null ||
 1006:                    authscheme.getRealm().length() <= 0) {
1007:                logger.info("Empty realm " + authscheme + " for " + curi);
1008:                continue;
1009:            }
1010:            result = authscheme;
1011:        }
1012:        
1013:        return result;
1014:    }
1015:        
1016:    /**
1017:     * @param handler Settings Handler.
1018:     * @param curi CrawlURI that got a 401.
1019:     * @param type Class of credential to get from curi.
1020:     * @return Set of credentials attached to this curi.
1021:     */
1022:    private Set<Credential> getCredentials(SettingsHandler handler, 
 1023:            CrawlURI curi, Class type) {
1024:        Set<Credential> result = null;
1025:
 1026:        if (curi.hasCredentialAvatars()) {
1027:            for (Iterator i = curi.getCredentialAvatars().iterator();
 1028:                    i.hasNext();) {
1029:                CredentialAvatar ca = (CredentialAvatar)i.next();
 1030:                if (ca.match(type)) {
 1031:                    if (result == null) {
1032:                        result = new HashSet<Credential>();
1033:                    }
1034:                    result.add(ca.getCredential(handler, curi));
1035:                }
1036:            }
1037:        }
1038:        return result;
1039:    }
1040:
 1041:    public void initialTasks() {
1042:        super.initialTasks();
1043:        this.getController().addCrawlStatusListener(this);
1044:        configureHttp();
1045:
1046:        // load cookies from a file if specified in the order file.
1047:        loadCookies();
1048:
 1049:        // I tried to get the default KeyManagers but it doesn't work unless
 1050:        // you point at a physical keystore. Passing null seems to do the
 1051:        // right thing so we'll go w/ that.
 1052:        try {
1053:            SSLContext context = SSLContext.getInstance("SSL");
 1054:            context.init(null, new TrustManager[] {
1055:                new ConfigurableX509TrustManager((String)
1056:                    getAttribute(ATTR_TRUST))}, null);
1057:            this.sslfactory = context.getSocketFactory();
1058:        } catch (Exception e) {
1059:            logger.log(Level.WARNING, "Failed configure of ssl context "
1060:                + e.getMessage(), e);
1061:        }
1062:    }
1063:    
 1064:    public void finalTasks() {
1065:        // At the end save cookies to the file specified in the order file.
1066:        saveCookies();
1067:        cleanupHttp();
1068:        super.finalTasks();
1069:    }
1070:
1071:    /**
1072:     * Perform any final cleanup related to the HttpClient instance.
1073:     */
 1074:    protected void cleanupHttp() {
 1075:        if (cookieDb != null) {
 1076:            try {
1077:                cookieDb.close();
1078:            } catch (DatabaseException e) {
1079:                // TODO Auto-generated catch block
1080:                e.printStackTrace();
1081:            }
1082:        }
1083:    }
1084:
 1085:    protected void configureHttp() throws RuntimeException {
1086:        // Get timeout.  Use it for socket and for connection timeout.
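          // Negative values are clamped to 0, which HttpClient treats as
          // "no timeout".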
1087:        int timeout = (getSoTimeout(null) > 0)? getSoTimeout(null): 0;
1088:        
1089:        // HttpConnectionManager cm = new ThreadLocalHttpConnectionManager();
1090:        HttpConnectionManager cm = new SingleHttpConnectionManager();
1091:        
1092:        // TODO: The following settings should be made in the corresponding
1093:        // HttpConnectionManager, not here.
1094:        HttpConnectionManagerParams hcmp = cm.getParams();
1095:        hcmp.setConnectionTimeout(timeout);
1096:        hcmp.setStaleCheckingEnabled(true);
1097:        // Minimizes bandwidth usage.  Setting to true disables Nagle's
1098:        // algorithm.  IBM JVMs < 142 give an NPE setting this boolean
1099:        // on ssl sockets.
1100:        hcmp.setTcpNoDelay(false);
1101:        
1102:        this.http = new HttpClient(cm);
1103:        HttpClientParams hcp = this.http.getParams();
1104:        // Set default socket timeout.
1105:        hcp.setSoTimeout(timeout);
1106:        // Set client to be version 1.0.
1107:        hcp.setVersion(HttpVersion.HTTP_1_0);
1108:
1109:        String addressStr = null;
 1110:        try {
1111:            addressStr = (String) getAttribute(ATTR_LOCAL_ADDRESS);
1112:        } catch (Exception e1) {
1113:            // If exception, just use default.
1114:        }
 1115:        if (addressStr != null && addressStr.length() > 0) {
 1116:            try {
1117:                InetAddress localAddress = InetAddress.getByName(addressStr);
1118:                this.http.getHostConfiguration().setLocalAddress(localAddress);
1119:            } catch (UnknownHostException e) {
1120:                // Convert all to RuntimeException so get an exception out
1121:                // if initialization fails.
1122:                throw new RuntimeException("Unknown host " + addressStr
1123:                    + " in " + ATTR_LOCAL_ADDRESS);
1124:            }
1125:        }
1126:
1127:        configureHttpCookies();
1128:        
1129:        // Configure how we want the method to act.
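          // These deliberately lenient settings tolerate common server
          // misbehavior: send cookies in a single header, accept ambiguous
          // status lines and sloppy transfer-encodings, and skip up to 10
          // junk lines while looking for the status line.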
1130:        this.http.getParams().setParameter(
1131:            HttpMethodParams.SINGLE_COOKIE_HEADER, new Boolean(true));
1132:        this.http.getParams().setParameter(
1133:            HttpMethodParams.UNAMBIGUOUS_STATUS_LINE , new Boolean(false));
1134:        this.http.getParams().setParameter(
1135:            HttpMethodParams.STRICT_TRANSFER_ENCODING, new Boolean(false));
1136:        this.http.getParams().setIntParameter(
1137:            HttpMethodParams.STATUS_LINE_GARBAGE_LIMIT, 10);
1138:        
1139:        HostConfiguration configOrNull = configureProxy(null);
 1140:        if (configOrNull != null) {
1141:            // global proxy settings are in effect
1142:            this.http.setHostConfiguration(configOrNull);
1143:        }
1144:        
1145:        // Use our own protocol factory, one that gets IP to use from
1146:        // heritrix cache (They're cached in CrawlHost instances).
1147:        final ServerCache cache = getController().getServerCache();
1148:        hcmp.setParameter(SERVER_CACHE_KEY, cache);
1149:        hcmp.setParameter(SSL_FACTORY_KEY, this.sslfactory);
1150:    }
1151:
1152:    /**
1153:     * Set the HttpClient HttpState instance to use a BDB-backed
1154:     * StoredSortedMap for cookie storage, if that option is chosen.
1155:     */
 1156:    private void configureHttpCookies() {
1157:        // If Bdb-backed cookies chosen, replace map in HttpState
1158:        if(((Boolean)getUncheckedAttribute(null, ATTR_BDB_COOKIES)).
 1159:                booleanValue()) {
 1160:            try {
1161:                Environment env = getController().getBdbEnvironment();
1162:                StoredClassCatalog classCatalog = getController().getClassCatalog();
1163:                DatabaseConfig dbConfig = new DatabaseConfig();
1164:                dbConfig.setTransactional(false);
1165:                dbConfig.setAllowCreate(true);
1166:                cookieDb = env.openDatabase(null, COOKIEDB_NAME, dbConfig);
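                  // Map keys are strings; values are Cookie objects
                  // serialized via the shared class catalog. The final
                  // 'true' makes the stored map writable.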
1167:                StoredSortedMap cookiesMap = new StoredSortedMap(cookieDb,
1168:                        new StringBinding(), new SerialBinding(classCatalog,
1169:                                Cookie.class), true);
1170:                this.http.getState().setCookiesMap(cookiesMap);
1171:            } catch (DatabaseException e) {
1172:                // TODO Auto-generated catch block
1173:                logger.severe(e.getMessage());
1174:                e.printStackTrace();
1175:            }
1176:        }
1177:    }
1178:
1179:    /**
1180:     * @param curi Current CrawlURI.  Used to get context.
1181:     * @return Socket timeout value.
1182:     */
 1183:    private int getSoTimeout(CrawlURI curi) {
1184:        Integer res = null;
 1185:        try {
1186:            res = (Integer) getAttribute(ATTR_SOTIMEOUT_MS, curi);
1187:        } catch (Exception e) {
1188:            res = DEFAULT_SOTIMEOUT_MS;
1189:        }
1190:        return res.intValue();
1191:    }
1192:
1193:    /**
1194:     * @param curi Current CrawlURI.  Used to get context.
1195:     * @return Timeout for the total request, in seconds.
1196:     */
1197:    private int getTimeout(CrawlURI curi) {
1198:        Integer res;
1199:        try {
1200:            res = (Integer) getAttribute(ATTR_TIMEOUT_SECONDS, curi);
1201:        } catch (Exception e) {
1202:            res = DEFAULT_TIMEOUT_SECONDS;
1203:        }
1204:        return res.intValue();
1205:    }
1206:
1207:    private int getMaxFetchRate(CrawlURI curi) {
1208:        Integer res;
1209:        try {
1210:            res = (Integer)getAttribute(ATTR_FETCH_BANDWIDTH_MAX, curi);
1211:        } catch (Exception e) {
1213:            res = DEFAULT_FETCH_BANDWIDTH_MAX;
1214:        }
1215:        return res.intValue();
1216:    }
1217:
1218:    private long getMaxLength(CrawlURI curi) {
1219:        Long res;
1220:        try {
1221:            res = (Long) getAttribute(ATTR_MAX_LENGTH_BYTES, curi);
1222:            if (res.longValue() == OLD_DEFAULT_MAX_LENGTH_BYTES) {
1223:                res = DEFAULT_MAX_LENGTH_BYTES;
1224:            }
1225:        } catch (Exception e) {
1226:            res = DEFAULT_MAX_LENGTH_BYTES;
1227:        }
1228:        return res.longValue();
1229:    }
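
    /*
     * Unit note (explanatory): the getters above use different units.
     * ATTR_SOTIMEOUT_MS is the per-read socket (SO_TIMEOUT) value in
     * milliseconds, ATTR_TIMEOUT_SECONDS bounds the whole fetch in seconds,
     * and ATTR_MAX_LENGTH_BYTES caps the downloaded body in bytes;
     * ATTR_FETCH_BANDWIDTH_MAX limits the per-fetch transfer rate.
     */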
1230:
1231:    /**
1232:     * Load cookies from a file before the first fetch.
1233:     * <p>
1234:     * The file is a text file in Netscape's 'cookies.txt' format.<br>
1235:     * Example entry of cookies.txt file:<br>
1236:     * <br>
1237:     * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
1238:     * <br>
1239:     * Each line has 7 tab-separated fields:<br>
1240:     * <li>1. DOMAIN: The domain that created, and has access to, the
1241:     * cookie value.
1242:     * <li>2. FLAG: A TRUE or FALSE value indicating whether all hosts
1243:     * within the given domain can access the cookie value.
1244:     * <li>3. PATH: The path within the domain for which the cookie value
1245:     * is valid.
1246:     * <li>4. SECURE: A TRUE or FALSE value indicating whether a secure
1247:     * connection is required to access the cookie value.
1248:     * <li>5. EXPIRATION: The expiration time of the cookie value (UNIX epoch seconds).
1249:     * <li>6. NAME: The name of the cookie value.
1250:     * <li>7. VALUE: The cookie value.
1251:     *
1252:     * @param cookiesFile file in Netscape's 'cookies.txt' format.
1253:     */
1254:    public void loadCookies(String cookiesFile) {
1255:        // Do nothing if cookiesFile is not specified.
1256:        if (cookiesFile == null || cookiesFile.length() == 0) {
1257:            return;
1258:        }
1259:        RandomAccessFile raf = null;
1260:        try {
1261:            raf = new RandomAccessFile(cookiesFile, "r");
1262:            String[] cookieParts;
1263:            String line;
1264:            Cookie cookie = null;
1265:            while ((line = raf.readLine()) != null) {
1266:                // Lines starting with '#' are comments; skip them.
1267:                if (!line.startsWith("#")) {
1268:                    cookieParts = line.split("\\t");
1269:                    if (cookieParts.length == 7) {
1270:                        // Create the cookie with no expiration date (-1).
1271:                        // TODO: add this as an option.
1272:                        cookie =
1273:                            new Cookie(cookieParts[0], cookieParts[5],
1274:                                cookieParts[6], cookieParts[2], -1,
1275:                                Boolean.valueOf(cookieParts[3]).booleanValue());
1276:
1277:                        cookie.setDomainAttributeSpecified(
1278:                            Boolean.valueOf(cookieParts[1]).booleanValue());
1282:                        this.http.getState().addCookie(cookie);
1283:                        logger.fine(
1284:                            "Adding cookie: " + cookie.toExternalForm());
1285:                    }
1286:                }
1287:            }
1288:        } catch (FileNotFoundException e) {
1289:            // We should probably throw FatalConfigurationException.
1290:            logger.severe("Could not find file: " + cookiesFile
1291:                    + " (Element: " + ATTR_LOAD_COOKIES + ")");
1292:
1293:        } catch (IOException e) {
1294:            // We should probably throw FatalConfigurationException.
1295:            logger.log(Level.SEVERE, "Failed reading " + cookiesFile, e);
1296:        } finally {
1297:            try {
1298:                if (raf != null) {
1299:                    raf.close();
1300:                }
1301:            } catch (IOException e) {
1302:                logger.log(Level.WARNING, "Failed closing " + cookiesFile, e);
1303:            }
1304:        }
1305:    }
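
    /*
     * Worked example: the Javadoc's sample entry
     *
     *   www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond
     *
     * (tab-separated) splits into seven fields, so the loop above builds
     *
     *   new Cookie("www.archive.org",  // [0] domain
     *       "details-visit",           // [5] name
     *       "texts-cralond",           // [6] value
     *       "/",                       // [2] path
     *       -1,                        // expiration ignored: session cookie
     *       false);                    // [3] secure flag
     *
     * and calls setDomainAttributeSpecified(false) because field [1] is
     * "FALSE".
     */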
1306:
1307:    /* (non-Javadoc)
1308:     * @see org.archive.crawler.framework.Processor#report()
1309:     */
1310:    public String report() {
1311:        StringBuffer ret = new StringBuffer();
1312:        ret.append("Processor: org.archive.crawler.fetcher.FetchHTTP\n");
1313:        ret.append("  Function:          Fetch HTTP URIs\n");
1314:        ret.append("  CrawlURIs handled: " + this.curisHandled + "\n");
1315:        ret.append("  Recovery retries:  " + this.recoveryRetries + "\n\n");
1316:
1317:        return ret.toString();
1318:    }
1319:
1320:
1321:    /**
1322:     * Load cookies from the file specified in the order file.
1323:     *
1324:     * <p>
1325:     * The file is a text file in Netscape's 'cookies.txt' format; see
1326:     * {@link #loadCookies(String)} for a description of the format and an
1327:     * example entry.
1328:     */
1343:    public void loadCookies() {
1344:        try {
1345:            loadCookies((String) getAttribute(ATTR_LOAD_COOKIES));
1346:        } catch (MBeanException e) {
1347:            logger.warning(e.getLocalizedMessage());
1348:        } catch (ReflectionException e) {
1349:            logger.warning(e.getLocalizedMessage());
1350:        } catch (AttributeNotFoundException e) {
1351:            logger.warning(e.getLocalizedMessage());
1352:        }
1353:    }
1354:    /**
1355:     * Saves cookies to the file specified in the order file.
1356:     *
1357:     * Output file is in the Netscape 'cookies.txt' format.
1358:     *
1359:     */
1360:    public void saveCookies() {
1361:        try {
1362:            saveCookies((String) getAttribute(ATTR_SAVE_COOKIES));
1363:        } catch (MBeanException e) {
1364:            logger.warning(e.getLocalizedMessage());
1365:        } catch (ReflectionException e) {
1366:            logger.warning(e.getLocalizedMessage());
1367:        } catch (AttributeNotFoundException e) {
1368:            logger.warning(e.getLocalizedMessage());
1369:        }
1370:    }
1371:    /**
1372:     * Saves cookies to a file.
1373:     *
1374:     * Output file is in the Netscape 'cookies.txt' format.
1375:     *
1376:     * @param saveCookiesFile output file.
1377:     */
1378:    public void saveCookies(String saveCookiesFile) {
1379:        // Do nothing if saveCookiesFile is not specified.
1380:        if (saveCookiesFile == null || saveCookiesFile.length() == 0) {
1381:            return;
1382:        }
1383:
1384:        FileOutputStream out = null;
1385:        try {
1386:            out = new FileOutputStream(new File(saveCookiesFile));
1387:            @SuppressWarnings("unchecked")
1388:            Map<String,Cookie> cookies = http.getState().getCookiesMap();
1389:            String tab = "\t";
1390:            out.write("# Heritrix Cookie File\n".getBytes());
1391:            out.write(
1392:                "# This file is in the Netscape cookies.txt format\n\n".getBytes());
1393:            for (Cookie cookie: cookies.values()) {
1394:                MutableString line =
1395:                    new MutableString(1024 * 2 /*Guess an initial size*/);
1396:                line.append(cookie.getDomain());
1397:                line.append(tab);
1398:                line.append(cookie.isDomainAttributeSpecified() ? "TRUE" : "FALSE");
1399:                line.append(tab);
1400:                line.append(cookie.getPath());
1401:                line.append(tab);
1402:                line.append(cookie.getSecure() ? "TRUE" : "FALSE");
1403:                line.append(tab);
1404:                // EXPIRATION: epoch seconds, or -1 for a session cookie,
1405:                // so the output has the seven fields loadCookies() expects.
1406:                line.append(cookie.getExpiryDate() == null ? "-1" :
1407:                    Long.toString(cookie.getExpiryDate().getTime() / 1000));
1408:                line.append(tab);
1409:                line.append(cookie.getName());
1410:                line.append(tab);
1411:                line.append(cookie.getValue() == null ? "" : cookie.getValue());
1412:                line.append("\n");
1413:                out.write(line.toString().getBytes());
1413:            }
1414:        } catch (FileNotFoundException e) {
1415:            // We should probably throw FatalConfigurationException.
1416:            logger.severe("Could not find file: " + saveCookiesFile
1417:                    + " (Element: " + ATTR_SAVE_COOKIES + ")");
1418:        } catch (IOException e) {
1419:            logger.log(Level.SEVERE, "Failed writing " + saveCookiesFile, e);
1420:        } finally {
1421:            try {
1422:                if (out != null) {
1423:                    out.close();
1424:                }
1425:            } catch (IOException e) {
1426:                logger.log(Level.WARNING, "Failed closing " + saveCookiesFile, e);
1427:            }
1428:        }
1429:    }
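
    /*
     * Worked example (hypothetical cookie): a session cookie with domain
     * "www.archive.org", path "/", name "details-visit" and value
     * "texts-cralond", with neither the domain attribute nor the secure
     * flag set, is written back as the tab-separated line
     *
     *   www.archive.org FALSE / FALSE -1 details-visit texts-cralond
     *
     * i.e. the same seven-field layout that loadCookies() parses.
     */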
1430:
1431:    /* (non-Javadoc)
1432:     * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List)
1433:     */
1434:    protected void listUsedFiles(List<String> list) {
1435:        // List the load- and save-cookies files.
1436:        try {
1437:            String tmp = (String)getAttribute(ATTR_LOAD_COOKIES);
1438:            if (tmp != null && tmp.length() > 0) {
1439:                File file = getSettingsHandler().
1440:                        getPathRelativeToWorkingDirectory(tmp);
1441:                list.add(file.getAbsolutePath());
1442:            }
1443:            tmp = (String)getAttribute(ATTR_SAVE_COOKIES);
1444:            if (tmp != null && tmp.length() > 0) {
1445:                File file = getSettingsHandler().
1446:                        getPathRelativeToWorkingDirectory(tmp);
1447:                list.add(file.getAbsolutePath());
1448:            }
1449:        } catch (AttributeNotFoundException e) {
1450:            logger.log(Level.SEVERE, e.getMessage(), e);
1451:        } catch (MBeanException e) {
1452:            logger.log(Level.SEVERE, e.getMessage(), e);
1453:        } catch (ReflectionException e) {
1454:            logger.log(Level.SEVERE, e.getMessage(), e);
1459:        }
1460:    }
1461:    
1462:    private void setAcceptHeaders(CrawlURI curi, HttpMethod get) {
1463:        try {
1464:            StringList acceptHeaders =
1465:                (StringList) getAttribute(ATTR_ACCEPT_HEADERS, curi);
1466:            if (!acceptHeaders.isEmpty()) {
1467:                for (ListIterator i = acceptHeaders.listIterator(); i.hasNext();) {
1468:                    String hdr = (String) i.next();
1469:                    String[] nvp = hdr.split(": +");
1470:                    if (nvp.length == 2) {
1471:                        get.setRequestHeader(nvp[0], nvp[1]);
1472:                    } else {
1473:                        logger.warning("Invalid accept header: " + hdr);
1474:                    }
1475:                }
1476:            }
1477:        } catch (AttributeNotFoundException e) {
1478:            logger.severe(e.getMessage());
1479:        }
1481:    }
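
    /*
     * Example (header values are illustrative assumptions): each entry in
     * the accept-headers list must be a complete "Name: value" string,
     * because it is split on ": +" into exactly two parts, e.g.
     *
     *   Accept: text/html, text/plain;q=0.5
     *   Accept-Language: en
     *
     * becomes get.setRequestHeader("Accept", "text/html, text/plain;q=0.5")
     * and get.setRequestHeader("Accept-Language", "en"); anything that does
     * not split into two parts is logged and skipped.
     */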
1482:
1483:    // custom serialization
1484:    private void writeObject(ObjectOutputStream stream) throws IOException {
1485:        stream.defaultWriteObject();
1486:        // save cookies
1487:        @SuppressWarnings("unchecked")
1488:        Collection<Cookie> c = http.getState().getCookiesMap().values();
1489:        Cookie[] cookies = c.toArray(new Cookie[c.size()]);
1490:        stream.writeObject(cookies);
1491:    }
1492:    
1493:    private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
1494:        stream.defaultReadObject();
1495:        Cookie[] cookies = (Cookie[]) stream.readObject();
1496:        ObjectPlusFilesInputStream coistream = (ObjectPlusFilesInputStream)stream;
1497:        coistream.registerFinishTask(new PostRestore(cookies));
1498:    }
1499:    
1500:    /**
1501:     * @return Returns the http instance.
1502:     */
1503:    protected HttpClient getHttp() {
1504:        return this.http;
1505:    }
1506:    
1507:    class PostRestore implements Runnable {
1508:        Cookie[] cookies;
1509:        public PostRestore(Cookie[] cookies) {
1510:            this.cookies = cookies;
1511:        }
1512:        public void run() {
1513:            configureHttp();
1514:            for (int i = 0; i < cookies.length; i++) {
1515:                getHttp().getState().addCookie(cookies[i]);
1516:            }
1517:        }
1518:    }
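
    /*
     * Restore-order note (explanatory): readObject() does not rebuild the
     * HttpClient inline, presumably because the rest of the object graph
     * may not be deserialized yet. Instead PostRestore is registered as a
     * finish task on the ObjectPlusFilesInputStream and runs after the full
     * graph has been read: it recreates the client via configureHttp() and
     * replays the cookies saved by writeObject() into the new HttpState.
     */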
1519:
1520:    /* (non-Javadoc)
1521:     * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
1522:     */
1523:    public void crawlStarted(String message) {
1524:        // No-op.
1525:    }
1526:    
1527:    /* (non-Javadoc)
1528:     * @see org.archive.crawler.event.CrawlStatusListener#crawlCheckpoint(java.io.File)
1529:     */
1530:    public void crawlCheckpoint(File checkpointDir) {
1531:        // No-op.
1532:    }
1533:
1534:    /* (non-Javadoc)
1535:     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnding(java.lang.String)
1536:     */
1537:    public void crawlEnding(String sExitMessage) {
1538:        // No-op.
1539:    }
1540:
1541:    /* (non-Javadoc)
1542:     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
1543:     */
1544:    public void crawlEnded(String sExitMessage) {
1545:        this.http = null;
1546:        this.midfetchfilters = null;
1547:    }
1548:
1549:    /* (non-Javadoc)
1550:     * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
1551:     */
1552:    public void crawlPausing(String statusMessage) {
1553:        // No-op.
1554:    }
1555:
1556:    /* (non-Javadoc)
1557:     * @see org.archive.crawler.event.CrawlStatusListener#crawlPaused(java.lang.String)
1558:     */
1559:    public void crawlPaused(String statusMessage) {
1560:        // No-op.
1561:    }
1562:
1563:    /* (non-Javadoc)
1564:     * @see org.archive.crawler.event.CrawlStatusListener#crawlResuming(java.lang.String)
1565:     */
1566:    public void crawlResuming(String statusMessage) {
1567:        // No-op.
1568:    }
1569:}