Merge "Omaha #3210 - moving foss from cots/ into AWIPS2_foss repository" into omaha_14.4.1

Former-commit-id: 0c8e7e05bd [formerly fca34641a0 [formerly 933f04911ad40e66f49da96c872306f4e89a2eca]]
Former-commit-id: fca34641a0
Former-commit-id: af040cffa8

commit 3d7e4bc0ba

589 changed files with 0 additions and 65548 deletions
@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
    <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
    <classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
    <classpathentry exported="true" kind="lib" path="jna-4.1.0.jar"/>
    <classpathentry kind="output" path="bin"/>
</classpath>
@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>com.sun.jna</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.eclipse.jdt.core.javabuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.ManifestBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.SchemaBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.eclipse.pde.PluginNature</nature>
        <nature>org.eclipse.jdt.core.javanature</nature>
    </natures>
</projectDescription>
@@ -1,7 +0,0 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6
@@ -1,10 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: JNA FOSS
Bundle-SymbolicName: com.sun.jna
Bundle-Version: 4.1.0
Bundle-ClassPath: jna-4.1.0.jar
Export-Package: com.sun.jna,
 com.sun.jna.ptr,
 com.sun.jna.win32
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
@@ -1,2 +0,0 @@
bin.includes = META-INF/,\
               jna-4.1.0.jar
Binary file not shown.
@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
    <classpathentry exported="true" kind="lib" path="jaxb-impl-2.1.9.jar"/>
    <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
    <classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
    <classpathentry kind="output" path="bin"/>
</classpath>
@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>com.sun.xml.bind</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.eclipse.jdt.core.javabuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.ManifestBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.SchemaBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.eclipse.pde.PluginNature</nature>
        <nature>org.eclipse.jdt.core.javanature</nature>
    </natures>
</projectDescription>
@@ -1,8 +0,0 @@
#Fri Jun 08 12:02:53 CDT 2012
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6
@@ -1,37 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: Bind
Bundle-SymbolicName: com.sun.xml.bind
Bundle-Version: 1.0.0.qualifier
Bundle-ClassPath: jaxb-impl-2.1.9.jar
Bundle-Vendor: SUN
Export-Package: com.sun.istack,
 com.sun.istack.localization,
 com.sun.xml.bind,
 com.sun.xml.bind.annotation,
 com.sun.xml.bind.api,
 com.sun.xml.bind.api.impl,
 com.sun.xml.bind.marshaller,
 com.sun.xml.bind.unmarshaller,
 com.sun.xml.bind.util,
 com.sun.xml.bind.v2,
 com.sun.xml.bind.v2.bytecode,
 com.sun.xml.bind.v2.model.annotation,
 com.sun.xml.bind.v2.model.core,
 com.sun.xml.bind.v2.model.impl,
 com.sun.xml.bind.v2.model.nav,
 com.sun.xml.bind.v2.model.runtime,
 com.sun.xml.bind.v2.runtime,
 com.sun.xml.bind.v2.runtime.output,
 com.sun.xml.bind.v2.runtime.property,
 com.sun.xml.bind.v2.runtime.reflect,
 com.sun.xml.bind.v2.runtime.reflect.opt,
 com.sun.xml.bind.v2.runtime.unmarshaller,
 com.sun.xml.bind.v2.schemagen,
 com.sun.xml.bind.v2.schemagen.episode,
 com.sun.xml.bind.v2.schemagen.xmlschema,
 com.sun.xml.bind.v2.util,
 com.sun.xml.txw2,
 com.sun.xml.txw2.annotation,
 com.sun.xml.txw2.output
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
@@ -1,3 +0,0 @@
bin.includes = META-INF/,\
               .,\
               jaxb-impl-2.1.9.jar
Binary file not shown.
@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
    <classpathentry exported="true" kind="lib" path="JavaAPIforKml.jar" sourcepath="JavaAPIforKml-sources.jar"/>
    <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
    <classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
    <classpathentry kind="output" path="bin"/>
</classpath>
@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>de.micromata.opengis.kml</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.eclipse.jdt.core.javabuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.ManifestBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.SchemaBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.eclipse.pde.PluginNature</nature>
        <nature>org.eclipse.jdt.core.javanature</nature>
    </natures>
</projectDescription>
@@ -1,8 +0,0 @@
#Wed May 30 18:56:22 CDT 2012
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6
Binary file not shown.
Binary file not shown.
@@ -1,13 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: Kml
Bundle-SymbolicName: de.micromata.opengis.kml
Bundle-Version: 1.0.0.qualifier
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
Bundle-ClassPath: JavaAPIforKml.jar,
 .
Export-Package: de.micromata.opengis.kml.v_2_2_0,
 de.micromata.opengis.kml.v_2_2_0.annotations,
 de.micromata.opengis.kml.v_2_2_0.atom,
 de.micromata.opengis.kml.v_2_2_0.gx,
 de.micromata.opengis.kml.v_2_2_0.xal
@@ -1,3 +0,0 @@
bin.includes = META-INF/,\
               .,\
               JavaAPIforKml.jar
@@ -1,17 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
    <classpathentry exported="true" kind="lib" path="apache-mime4j-core-0.7.jar"/>
    <classpathentry exported="true" kind="lib" path="apache-mime4j-dom-0.7.jar"/>
    <classpathentry exported="true" kind="lib" path="asm-3.1.jar"/>
    <classpathentry exported="true" kind="lib" path="boilerpipe-1.1.0.jar"/>
    <classpathentry exported="true" kind="lib" path="je-4.0.92.jar"/>
    <classpathentry exported="true" kind="lib" path="metadata-extractor-2.4.0-beta-1.jar"/>
    <classpathentry exported="true" kind="lib" path="tagsoup-1.2.1.jar"/>
    <classpathentry exported="true" kind="lib" path="tika-core-1.0.jar"/>
    <classpathentry exported="true" kind="lib" path="tika-parsers-1.0.jar"/>
    <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
    <classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
    <classpathentry kind="src" path="src"/>
    <classpathentry kind="src" path="resources"/>
    <classpathentry kind="output" path="bin"/>
</classpath>
@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>edu.uci.ics.crawler4j</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.eclipse.jdt.core.javabuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.ManifestBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.SchemaBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.eclipse.pde.PluginNature</nature>
        <nature>org.eclipse.jdt.core.javanature</nature>
    </natures>
</projectDescription>
@@ -1,8 +0,0 @@
#Mon Feb 20 17:18:28 CST 2012
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6
@@ -1,162 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: Crawler4j
Bundle-SymbolicName: edu.uci.ics.crawler4j
Bundle-Version: 1.0.0.qualifier
Bundle-ActivationPolicy: lazy
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
Bundle-ClassPath: apache-mime4j-core-0.7.jar,
 apache-mime4j-dom-0.7.jar,
 asm-3.1.jar,
 boilerpipe-1.1.0.jar,
 je-4.0.92.jar,
 metadata-extractor-2.4.0-beta-1.jar,
 tagsoup-1.2.1.jar,
 tika-core-1.0.jar,
 tika-parsers-1.0.jar,
 .
Require-Bundle: org.apache.commons.codec;bundle-version="1.4.0",
 org.apache.http;bundle-version="4.1.2",
 org.apache.commons.compress;bundle-version="1.5.0"
Export-Package: com.drew.imaging,
 com.drew.imaging.jpeg,
 com.drew.imaging.tiff,
 com.drew.lang,
 com.drew.metadata,
 com.drew.metadata.exif,
 com.drew.metadata.iptc,
 com.drew.metadata.jpeg,
 com.sleepycat.asm,
 com.sleepycat.bind,
 com.sleepycat.bind.serial,
 com.sleepycat.bind.tuple,
 com.sleepycat.collections,
 com.sleepycat.compat,
 com.sleepycat.je,
 com.sleepycat.je.cleaner,
 com.sleepycat.je.config,
 com.sleepycat.je.dbi,
 com.sleepycat.je.evictor,
 com.sleepycat.je.incomp,
 com.sleepycat.je.jmx,
 com.sleepycat.je.jmx.plugin,
 com.sleepycat.je.latch,
 com.sleepycat.je.log,
 com.sleepycat.je.log.entry,
 com.sleepycat.je.recovery,
 com.sleepycat.je.rep,
 com.sleepycat.je.rep.elections,
 com.sleepycat.je.rep.impl,
 com.sleepycat.je.rep.impl.networkRestore,
 com.sleepycat.je.rep.impl.node,
 com.sleepycat.je.rep.jmx,
 com.sleepycat.je.rep.jmx.plugin,
 com.sleepycat.je.rep.monitor,
 com.sleepycat.je.rep.stream,
 com.sleepycat.je.rep.txn,
 com.sleepycat.je.rep.util,
 com.sleepycat.je.rep.util.ldiff,
 com.sleepycat.je.rep.utilint,
 com.sleepycat.je.rep.vlsn,
 com.sleepycat.je.tree,
 com.sleepycat.je.txn,
 com.sleepycat.je.util,
 com.sleepycat.je.utilint,
 com.sleepycat.persist,
 com.sleepycat.persist.evolve,
 com.sleepycat.persist.impl,
 com.sleepycat.persist.model,
 com.sleepycat.persist.raw,
 com.sleepycat.util,
 com.sleepycat.util.keyrange,
 de.l3s.boilerpipe,
 de.l3s.boilerpipe.conditions,
 de.l3s.boilerpipe.document,
 de.l3s.boilerpipe.estimators,
 de.l3s.boilerpipe.extractors,
 de.l3s.boilerpipe.filters.english,
 de.l3s.boilerpipe.filters.heuristics,
 de.l3s.boilerpipe.filters.simple,
 de.l3s.boilerpipe.labels,
 de.l3s.boilerpipe.sax,
 de.l3s.boilerpipe.util,
 edu.uci.ics.crawler4j.crawler,
 edu.uci.ics.crawler4j.fetcher,
 edu.uci.ics.crawler4j.frontier,
 edu.uci.ics.crawler4j.parser,
 edu.uci.ics.crawler4j.robotstxt,
 edu.uci.ics.crawler4j.url,
 edu.uci.ics.crawler4j.util,
 org.apache.james.mime4j,
 org.apache.james.mime4j.codec,
 org.apache.james.mime4j.dom,
 org.apache.james.mime4j.dom.address,
 org.apache.james.mime4j.dom.datetime,
 org.apache.james.mime4j.dom.field,
 org.apache.james.mime4j.field,
 org.apache.james.mime4j.field.address,
 org.apache.james.mime4j.field.contentdisposition.parser,
 org.apache.james.mime4j.field.contenttype.parser,
 org.apache.james.mime4j.field.datetime.parser,
 org.apache.james.mime4j.field.language.parser,
 org.apache.james.mime4j.field.mimeversion.parser,
 org.apache.james.mime4j.field.structured.parser,
 org.apache.james.mime4j.io,
 org.apache.james.mime4j.message,
 org.apache.james.mime4j.parser,
 org.apache.james.mime4j.stream,
 org.apache.james.mime4j.util,
 org.apache.tika,
 org.apache.tika.config,
 org.apache.tika.detect,
 org.apache.tika.exception,
 org.apache.tika.extractor,
 org.apache.tika.fork,
 org.apache.tika.io,
 org.apache.tika.language,
 org.apache.tika.metadata,
 org.apache.tika.mime,
 org.apache.tika.parser,
 org.apache.tika.parser.asm,
 org.apache.tika.parser.audio,
 org.apache.tika.parser.chm,
 org.apache.tika.parser.chm.accessor,
 org.apache.tika.parser.chm.assertion,
 org.apache.tika.parser.chm.core,
 org.apache.tika.parser.chm.exception,
 org.apache.tika.parser.chm.lzx,
 org.apache.tika.parser.dwg,
 org.apache.tika.parser.epub,
 org.apache.tika.parser.feed,
 org.apache.tika.parser.font,
 org.apache.tika.parser.hdf,
 org.apache.tika.parser.html,
 org.apache.tika.parser.image,
 org.apache.tika.parser.image.xmp,
 org.apache.tika.parser.internal,
 org.apache.tika.parser.iwork,
 org.apache.tika.parser.jpeg,
 org.apache.tika.parser.mail,
 org.apache.tika.parser.mbox,
 org.apache.tika.parser.microsoft,
 org.apache.tika.parser.microsoft.ooxml,
 org.apache.tika.parser.mp3,
 org.apache.tika.parser.netcdf,
 org.apache.tika.parser.odf,
 org.apache.tika.parser.opendocument,
 org.apache.tika.parser.pdf,
 org.apache.tika.parser.pkg,
 org.apache.tika.parser.prt,
 org.apache.tika.parser.rtf,
 org.apache.tika.parser.txt,
 org.apache.tika.parser.video,
 org.apache.tika.parser.xml,
 org.apache.tika.sax,
 org.apache.tika.sax.xpath,
 org.apache.tika.utils,
 org.ccil.cowan.tagsoup,
 org.ccil.cowan.tagsoup.jaxp,
 org.cyberneko.html,
 org.objectweb.asm,
 org.objectweb.asm.signature
Import-Package: org.apache.log4j
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,14 +0,0 @@
source.. = src/
output.. = bin/
bin.includes = META-INF/,\
               .,\
               apache-mime4j-core-0.7.jar,\
               apache-mime4j-dom-0.7.jar,\
               asm-3.1.jar,\
               boilerpipe-1.1.0.jar,\
               je-4.0.92.jar,\
               metadata-extractor-2.4.0-beta-1.jar,\
               tagsoup-1.2.1.jar,\
               tika-core-1.0.jar,\
               tika-parsers-1.0.jar,\
               resources/
Binary file not shown.
@@ -1,9 +0,0 @@
log4j.rootCategory=DEBUG, stdout

log4j.appender.stdout.Threshold=INFO

log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout

log4j.appender.stdout.layout.ConversionPattern=%5p [%t] %m%n

Binary file not shown.
File diff suppressed because it is too large
@@ -1,37 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.crawler;

/**
 * Several core components of crawler4j extend this class to make them
 * configurable.
 *
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public abstract class Configurable {

    protected CrawlConfig config;

    protected Configurable(CrawlConfig config) {
        this.config = config;
    }

    public CrawlConfig getConfig() {
        return config;
    }
}
@@ -1,384 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.crawler;

public class CrawlConfig {

    /**
     * The folder which will be used by crawler for storing the intermediate
     * crawl data. The content of this folder should not be modified manually.
     */
    private String crawlStorageFolder;

    /**
     * If this feature is enabled, you would be able to resume a previously
     * stopped/crashed crawl. However, it makes crawling slightly slower
     */
    private boolean resumableCrawling = false;

    /**
     * Maximum depth of crawling For unlimited depth this parameter should be
     * set to -1
     */
    private int maxDepthOfCrawling = -1;

    /**
     * Maximum number of pages to fetch For unlimited number of pages, this
     * parameter should be set to -1
     */
    private int maxPagesToFetch = -1;

    /**
     * user-agent string that is used for representing your crawler to web
     * servers. See http://en.wikipedia.org/wiki/User_agent for more details
     */
    private String userAgentString = "crawler4j (http://code.google.com/p/crawler4j/)";

    /**
     * Politeness delay in milliseconds (delay between sending two requests to
     * the same host).
     */
    private int politenessDelay = 200;

    /**
     * Should we also crawl https pages?
     */
    private boolean includeHttpsPages = false;

    /**
     * Should we fetch binary content such as images, audio, ...?
     */
    private boolean includeBinaryContentInCrawling = false;

    /**
     * Maximum Connections per host
     */
    private int maxConnectionsPerHost = 100;

    /**
     * Maximum total connections
     */
    private int maxTotalConnections = 100;

    /**
     * Socket timeout in milliseconds
     */
    private int socketTimeout = 20000;

    /**
     * Connection timeout in milliseconds
     */
    private int connectionTimeout = 30000;

    /**
     * Max number of outgoing links which are processed from a page
     */
    private int maxOutgoingLinksToFollow = 5000;

    /**
     * Max allowed size of a page. Pages larger than this size will not be
     * fetched.
     */
    private int maxDownloadSize = 1048576;

    /**
     * Should we follow redirects?
     */
    private boolean followRedirects = true;

    /**
     * If crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy host.
     */
    private String proxyHost = null;

    /**
     * If crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy port.
     */
    private int proxyPort = 80;

    /**
     * If crawler should run behind a proxy and user/pass is needed for
     * authentication in proxy, this parameter can be used for specifying the
     * username.
     */
    private String proxyUsername = null;

    /**
     * If crawler should run behind a proxy and user/pass is needed for
     * authentication in proxy, this parameter can be used for specifying the
     * password.
     */
    private String proxyPassword = null;

    public CrawlConfig() {
    }

    /**
     * Validates the configs specified by this instance.
     *
     * @throws Exception
     */
    public void validate() throws Exception {
        if (crawlStorageFolder == null) {
            throw new Exception(
                    "Crawl storage folder is not set in the CrawlConfig.");
        }
        if (politenessDelay < 0) {
            throw new Exception("Invalid value for politeness delay: "
                    + politenessDelay);
        }
        if (maxDepthOfCrawling < -1) {
            throw new Exception(
                    "Maximum crawl depth should be either a positive number or -1 for unlimited depth.");
        }
        if (maxDepthOfCrawling > Short.MAX_VALUE) {
            throw new Exception("Maximum value for crawl depth is "
                    + Short.MAX_VALUE);
        }

    }

    public String getCrawlStorageFolder() {
        return crawlStorageFolder;
    }

    /**
     * The folder which will be used by crawler for storing the intermediate
     * crawl data. The content of this folder should not be modified manually.
     */
    public void setCrawlStorageFolder(String crawlStorageFolder) {
        this.crawlStorageFolder = crawlStorageFolder;
    }

    public boolean isResumableCrawling() {
        return resumableCrawling;
    }

    /**
     * If this feature is enabled, you would be able to resume a previously
     * stopped/crashed crawl. However, it makes crawling slightly slower
     */
    public void setResumableCrawling(boolean resumableCrawling) {
        this.resumableCrawling = resumableCrawling;
    }

    public int getMaxDepthOfCrawling() {
        return maxDepthOfCrawling;
    }

    /**
     * Maximum depth of crawling For unlimited depth this parameter should be
     * set to -1
     */
    public void setMaxDepthOfCrawling(int maxDepthOfCrawling) {
        this.maxDepthOfCrawling = maxDepthOfCrawling;
    }

    public int getMaxPagesToFetch() {
        return maxPagesToFetch;
    }

    /**
     * Maximum number of pages to fetch For unlimited number of pages, this
     * parameter should be set to -1
     */
    public void setMaxPagesToFetch(int maxPagesToFetch) {
        this.maxPagesToFetch = maxPagesToFetch;
    }

    public String getUserAgentString() {
        return userAgentString;
    }

    /**
     * user-agent string that is used for representing your crawler to web
     * servers. See http://en.wikipedia.org/wiki/User_agent for more details
     */
    public void setUserAgentString(String userAgentString) {
        this.userAgentString = userAgentString;
    }

    public int getPolitenessDelay() {
        return politenessDelay;
    }

    /**
     * Politeness delay in milliseconds (delay between sending two requests to
     * the same host).
     *
     * @param politenessDelay
     *            the delay in milliseconds.
     */
    public void setPolitenessDelay(int politenessDelay) {
        this.politenessDelay = politenessDelay;
    }

    public boolean isIncludeHttpsPages() {
        return includeHttpsPages;
    }

    /**
     * Should we also crawl https pages?
     */
    public void setIncludeHttpsPages(boolean includeHttpsPages) {
        this.includeHttpsPages = includeHttpsPages;
    }

    public boolean isIncludeBinaryContentInCrawling() {
        return includeBinaryContentInCrawling;
    }

    /**
     * Should we fetch binary content such as images, audio, ...?
     */
    public void setIncludeBinaryContentInCrawling(
            boolean includeBinaryContentInCrawling) {
        this.includeBinaryContentInCrawling = includeBinaryContentInCrawling;
    }

    public int getMaxConnectionsPerHost() {
        return maxConnectionsPerHost;
    }

    /**
     * Maximum Connections per host
     */
    public void setMaxConnectionsPerHost(int maxConnectionsPerHost) {
        this.maxConnectionsPerHost = maxConnectionsPerHost;
    }

    public int getMaxTotalConnections() {
        return maxTotalConnections;
    }

    /**
     * Maximum total connections
     */
    public void setMaxTotalConnections(int maxTotalConnections) {
        this.maxTotalConnections = maxTotalConnections;
    }

    public int getSocketTimeout() {
        return socketTimeout;
    }

    /**
     * Socket timeout in milliseconds
     */
    public void setSocketTimeout(int socketTimeout) {
        this.socketTimeout = socketTimeout;
    }

    public int getConnectionTimeout() {
        return connectionTimeout;
    }

    /**
     * Connection timeout in milliseconds
     */
    public void setConnectionTimeout(int connectionTimeout) {
        this.connectionTimeout = connectionTimeout;
    }

    public int getMaxOutgoingLinksToFollow() {
        return maxOutgoingLinksToFollow;
    }

    /**
     * Max number of outgoing links which are processed from a page
     */
    public void setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow) {
        this.maxOutgoingLinksToFollow = maxOutgoingLinksToFollow;
    }

    public int getMaxDownloadSize() {
        return maxDownloadSize;
    }

    /**
     * Max allowed size of a page. Pages larger than this size will not be
     * fetched.
     */
    public void setMaxDownloadSize(int maxDownloadSize) {
        this.maxDownloadSize = maxDownloadSize;
    }

    public boolean isFollowRedirects() {
        return followRedirects;
    }

    /**
     * Should we follow redirects?
     */
    public void setFollowRedirects(boolean followRedirects) {
        this.followRedirects = followRedirects;
    }

    public String getProxyHost() {
        return proxyHost;
    }

    /**
     * If crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy host.
     */
    public void setProxyHost(String proxyHost) {
        this.proxyHost = proxyHost;
    }

    public int getProxyPort() {
        return proxyPort;
    }

    /**
     * If crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy port.
     */
    public void setProxyPort(int proxyPort) {
        this.proxyPort = proxyPort;
    }

    public String getProxyUsername() {
        return proxyUsername;
    }

    /**
     * If crawler should run behind a proxy and user/pass is needed for
     * authentication in proxy, this parameter can be used for specifying the
     * username.
     */
    public void setProxyUsername(String proxyUsername) {
        this.proxyUsername = proxyUsername;
    }

    public String getProxyPassword() {
        return proxyPassword;
    }

    /**
     * If crawler should run behind a proxy and user/pass is needed for
     * authentication in proxy, this parameter can be used for specifying the
     * password.
     */
    public void setProxyPassword(String proxyPassword) {
        this.proxyPassword = proxyPassword;
    }

}
@@ -1,462 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.crawler;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.apache.log4j.Logger;

import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.IO;

/**
 * The controller that manages a crawling session. This class creates the
 * crawler threads and monitors their progress.
 *
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class CrawlController extends Configurable {

    private static final Logger logger = Logger.getLogger(CrawlController.class
            .getName());

    /**
     * The 'customData' object can be used for passing custom crawl-related
     * configurations to different components of the crawler.
     */
    protected Object customData;

    /**
     * Once the crawling session finishes the controller collects the local data
     * of the crawler threads and stores them in this List.
     */
    protected List<Object> crawlersLocalData = new ArrayList<Object>();

    /**
     * Is the crawling of this session finished?
     */
    protected boolean finished;

    /**
     * Is the crawling session set to 'shutdown'. Crawler threads monitor this
     * flag and when it is set they will no longer process new pages.
     */
    protected boolean shuttingDown;

    protected PageFetcher pageFetcher;

    protected RobotstxtServer robotstxtServer;

    protected Frontier frontier;

    protected DocIDServer docIdServer;

    protected final Object waitingLock = new Object();

    public CrawlController(CrawlConfig config, PageFetcher pageFetcher,
            RobotstxtServer robotstxtServer) throws Exception {
        super(config);

        config.validate();
        File folder = new File(config.getCrawlStorageFolder());
        if (!folder.exists()) {
            if (!folder.mkdirs()) {
                throw new Exception("Couldn't create this folder: "
                        + folder.getAbsolutePath());
            }
        }

        boolean resumable = config.isResumableCrawling();

        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setAllowCreate(true);
        envConfig.setTransactional(resumable);
        envConfig.setLocking(resumable);

        File envHome = new File(config.getCrawlStorageFolder() + "/frontier");
        if (!envHome.exists()) {
            if (!envHome.mkdir()) {
                throw new Exception("Couldn't create this folder: "
                        + envHome.getAbsolutePath());
            }
        }
        if (!resumable) {
            IO.deleteFolderContents(envHome);
        }

        Environment env = new Environment(envHome, envConfig);
        docIdServer = new DocIDServer(env, config);
        frontier = new Frontier(env, config, docIdServer);

        this.pageFetcher = pageFetcher;
        this.robotstxtServer = robotstxtServer;

        finished = false;
        shuttingDown = false;
    }

    /**
     * Start the crawling session and wait for it to finish.
     *
     * @param _c
     *            the class that implements the logic for crawler threads
     * @param numberOfCrawlers
     *            the number of concurrent threads that will be contributing in
     *            this crawling session.
     */
    public <T extends WebCrawler> void start(final ArrayList<T> webCrawlers,
            final int numberOfCrawlers) {

        this.start(webCrawlers, numberOfCrawlers, true);
    }

    /**
     * Start the crawling session and return immediately.
     *
     * @param _c
     *            the class that implements the logic for crawler threads
     * @param numberOfCrawlers
     *            the number of concurrent threads that will be contributing in
     *            this crawling session.
     */
    public <T extends WebCrawler> void startNonBlocking(
            ArrayList<T> webCrawlers, final int numberOfCrawlers) {
        this.start(webCrawlers, numberOfCrawlers, false);
    }

    protected <T extends WebCrawler> void start(final ArrayList<T> crawlers,
            final int numberOfCrawlers, boolean isBlocking) {
        try {
            finished = false;
            crawlersLocalData.clear();
            final List<Thread> threads = new ArrayList<Thread>();

            for (int i = 1; i <= numberOfCrawlers; i++) {
                T crawler = crawlers.get(i - 1);
                Thread thread = new Thread(crawler, "Crawler " + i);
                crawler.setThread(thread);
                crawler.init(i, this);
                thread.start();
                threads.add(thread);
                // System.out.println("Crawler " + i + " started.");
            }

            final CrawlController controller = this;

            Thread monitorThread = new Thread(new Runnable() {

                @Override
                public void run() {
                    try {
                        synchronized (waitingLock) {

                            while (true) {
                                sleep();
                                boolean someoneIsWorking = false;
                                for (int i = 0; i < threads.size(); i++) {
                                    Thread thread = threads.get(i);
                                    if (!thread.isAlive()) {
                                        if (!shuttingDown) {
                                            logger.info("Thread " + i
                                                    + " has died.");
                                            // T crawler = _c.newInstance();
                                            // thread = new Thread(crawler,
                                            // "Crawler " + (i + 1));
                                            threads.remove(i);
                                            // threads.add(i, thread);
                                            // crawler.setThread(thread);
                                            // crawler.init(i + 1, controller);
                                            // thread.start();
                                            crawlers.remove(i);
                                            // crawlers.add(i, crawler);
                                        }
                                    } else if (crawlers.get(i)
                                            .isNotWaitingForNewURLs()) {
                                        someoneIsWorking = true;
                                    }
                                }
                                if (!someoneIsWorking) {
                                    // Make sure again that none of the threads
                                    // are
                                    // alive.
                                    // System.out
                                    // .println("It looks like no threads are working...");
                                    someoneIsWorking = false;
                                    for (int i = 0; i < threads.size(); i++) {
                                        Thread thread = threads.get(i);
                                        if (thread.isAlive()
                                                && crawlers
                                                        .get(i)
                                                        .isNotWaitingForNewURLs()) {
                                            someoneIsWorking = true;
                                        }
                                    }
                                    if (!someoneIsWorking) {
                                        if (!shuttingDown) {
                                            long queueLength = frontier
                                                    .getQueueLength();
                                            if (queueLength > 0) {
                                                continue;
                                            }
                                            // System.out
                                            // .println("No thread is working and no more URLs are in queue ...");
                                            queueLength = frontier
                                                    .getQueueLength();
                                            if (queueLength > 0) {
                                                continue;
                                            }
                                        }

                                        logger.info("All of the crawlers are stopped. Finishing the process...");
                                        // At this step, frontier notifies the
                                        // threads that were
                                        // waiting for new URLs and they should
                                        // stop
                                        frontier.finish();
                                        for (T crawler : crawlers) {
                                            crawler.onBeforeExit();
                                            crawlersLocalData.add(crawler
                                                    .getMyLocalData());
                                        }

                                        // System.out.println("Final clean up...");

                                        frontier.close();
                                        docIdServer.close();
                                        pageFetcher.shutDown();

                                        finished = true;
                                        waitingLock.notifyAll();

                                        return;
                                    }
                                }
                            }
                        }
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            });

            monitorThread.start();

            if (isBlocking) {
                waitUntilFinish();
            }

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Wait until this crawling session finishes.
     */
    public void waitUntilFinish() {
        while (!finished) {
            synchronized (waitingLock) {
                if (finished) {
                    return;
                }
                try {
                    waitingLock.wait();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Once the crawling session finishes the controller collects the local data
     * of the crawler threads and stores them in a List. This function returns
     * the reference to this list.
     */
    public List<Object> getCrawlersLocalData() {
        return crawlersLocalData;
    }

    protected void sleep() {
        try {
            Thread.sleep(500);
            // System.out.println("Sleeping!!!!!!");
        } catch (Exception ignored) {
        }
    }

    /**
     * Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
     * to extract new URLs in it and follow them for crawling.
     *
     * @param pageUrl
     *            the URL of the seed
     */
    public void addSeed(String pageUrl) {
        addSeed(pageUrl, -1);
    }

    /**
     * Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
     * to extract new URLs in it and follow them for crawling. You can also
     * specify a specific document id to be assigned to this seed URL. This
     * document id needs to be unique. Also, note that if you add three seeds
     * with document ids 1,2, and 7. Then the next URL that is found during the
     * crawl will get a doc id of 8. Also you need to ensure to add seeds in
     * increasing order of document ids.
     *
     * Specifying doc ids is mainly useful when you have had a previous crawl
     * and have stored the results and want to start a new crawl with seeds
     * which get the same document ids as the previous crawl.
     *
     * @param pageUrl
     *            the URL of the seed
     * @param docId
     *            the document id that you want to be assigned to this seed URL.
     *
     */
    public void addSeed(String pageUrl, int docId) {
        String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
        if (canonicalUrl == null) {
            logger.error("Invalid seed URL: " + pageUrl);
            return;
        }
        if (docId < 0) {
            docId = docIdServer.getDocId(canonicalUrl);
            if (docId > 0) {
                // This URL is already seen.
                return;
            }
            docId = docIdServer.getNewDocID(canonicalUrl);
        } else {
            try {
                docIdServer.addUrlAndDocId(canonicalUrl, docId);
            } catch (Exception e) {
                logger.error("Could not add seed: " + e.getMessage());
            }
        }

        WebURL webUrl = new WebURL();
        webUrl.setURL(canonicalUrl);
        webUrl.setDocid(docId);
        webUrl.setDepth((short) 0);
        if (!robotstxtServer.allows(webUrl)) {
            logger.info("Robots.txt does not allow this seed: " + pageUrl);
        } else {
            frontier.schedule(webUrl);
        }
    }

    /**
     * This function can called to assign a specific document id to a url. This
     * feature is useful when you have had a previous crawl and have stored the
     * Urls and their associated document ids and want to have a new crawl which
     * is aware of the previously seen Urls and won't re-crawl them.
     *
     * Note that if you add three seen Urls with document ids 1,2, and 7. Then
     * the next URL that is found during the crawl will get a doc id of 8. Also
     * you need to ensure to add seen Urls in increasing order of document ids.
     *
     * @param pageUrl
     *            the URL of the page
     * @param docId
     *            the document id that you want to be assigned to this URL.
     *
     */
    public void addSeenUrl(String url, int docId) {
        String canonicalUrl = URLCanonicalizer.getCanonicalURL(url);
        if (canonicalUrl == null) {
            logger.error("Invalid Url: " + url);
            return;
        }
        try {
            docIdServer.addUrlAndDocId(canonicalUrl, docId);
        } catch (Exception e) {
            logger.error("Could not add seen url: " + e.getMessage());
        }
    }

    public PageFetcher getPageFetcher() {
        return pageFetcher;
    }

    public void setPageFetcher(PageFetcher pageFetcher) {
        this.pageFetcher = pageFetcher;
    }

    public RobotstxtServer getRobotstxtServer() {
        return robotstxtServer;
    }

    public void setRobotstxtServer(RobotstxtServer robotstxtServer) {
        this.robotstxtServer = robotstxtServer;
    }

    public Frontier getFrontier() {
        return frontier;
    }

    public void setFrontier(Frontier frontier) {
        this.frontier = frontier;
    }

    public DocIDServer getDocIdServer() {
        return docIdServer;
    }

    public void setDocIdServer(DocIDServer docIdServer) {
        this.docIdServer = docIdServer;
    }

    public Object getCustomData() {
        return customData;
    }

    public void setCustomData(Object customData) {
        this.customData = customData;
    }

    public boolean isFinished() {
        return this.finished;
    }

    public boolean isShuttingDown() {
        return shuttingDown;
    }

    /**
     * Set the current crawling session set to 'shutdown'. Crawler threads
     * monitor the shutdown flag and when it is set to true, they will no longer
     * process new pages.
     */
    public void Shutdown() {
        logger.info("Shutting down...");
        this.shuttingDown = true;
        frontier.finish();
    }
}
@@ -1,155 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.crawler;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.util.EntityUtils;

import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * This class contains the data for a fetched and parsed page.
 *
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class Page {

    /**
     * The URL of this page.
     */
    protected WebURL url;

    /**
     * The content of this page in binary format.
     */
    protected byte[] contentData;

    /**
     * The ContentType of this page. For example: "text/html; charset=UTF-8"
     */
    protected String contentType;

    /**
     * The encoding of the content. For example: "gzip"
     */
    protected String contentEncoding;

    /**
     * The charset of the content. For example: "UTF-8"
     */
    protected String contentCharset;

    /**
     * The parsed data populated by parsers
     */
    protected ParseData parseData;

    public Page(WebURL url) {
        this.url = url;
    }

    public WebURL getWebURL() {
        return url;
    }

    public void setWebURL(WebURL url) {
        this.url = url;
    }

    /**
     * Loads the content of this page from a fetched HttpEntity.
     */
    public void load(HttpEntity entity) throws Exception {

        contentType = null;
        Header type = entity.getContentType();
        if (type != null) {
            contentType = type.getValue();
        }

        contentEncoding = null;
        Header encoding = entity.getContentEncoding();
        if (encoding != null) {
            contentEncoding = encoding.getValue();
        }

        contentCharset = EntityUtils.getContentCharSet(entity);

        contentData = EntityUtils.toByteArray(entity);

    }

    /**
     * Returns the parsed data generated for this page by parsers
     */
    public ParseData getParseData() {
        return parseData;
    }

    public void setParseData(ParseData parseData) {
        this.parseData = parseData;
    }

    /**
     * Returns the content of this page in binary format.
     */
    public byte[] getContentData() {
        return contentData;
    }

    public void setContentData(byte[] contentData) {
        this.contentData = contentData;
    }

    /**
     * Returns the ContentType of this page. For example:
     * "text/html; charset=UTF-8"
     */
    public String getContentType() {
        return contentType;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    /**
     * Returns the encoding of the content. For example: "gzip"
     */
    public String getContentEncoding() {
        return contentEncoding;
    }

    public void setContentEncoding(String contentEncoding) {
        this.contentEncoding = contentEncoding;
    }

    /**
     * Returns the charset of the content. For example: "UTF-8"
     */
    public String getContentCharset() {
        return contentCharset;
    }

    public void setContentCharset(String contentCharset) {
        this.contentCharset = contentCharset;
    }

}
@@ -1,347 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.crawler;

import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpStatus;
import org.apache.log4j.Logger;

import edu.uci.ics.crawler4j.fetcher.CustomFetchStatus;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * WebCrawler class in the Runnable class that is executed by each crawler
 * thread.
 *
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class WebCrawler implements Runnable {

    protected static final Logger logger = Logger.getLogger(WebCrawler.class
            .getName());

    /**
     * The id associated to the crawler thread running this instance
     */
    protected int myId;

    /**
     * The controller instance that has created this crawler thread. This
     * reference to the controller can be used for getting configurations of the
     * current crawl or adding new seeds during runtime.
     */
    protected CrawlController myController;

    /**
     * The thread within which this crawler instance is running.
     */
    private Thread myThread;

    /**
     * The parser that is used by this crawler instance to parse the content of
     * the fetched pages.
     */
    private Parser parser;

    /**
     * The fetcher that is used by this crawler instance to fetch the content of
     * pages from the web.
     */
    private PageFetcher pageFetcher;

    /**
     * The RobotstxtServer instance that is used by this crawler instance to
     * determine whether the crawler is allowed to crawl the content of each
     * page.
     */
    private RobotstxtServer robotstxtServer;

    /**
     * The DocIDServer that is used by this crawler instance to map each URL to
     * a unique docid.
     */
    private DocIDServer docIdServer;

    /**
     * The Frontier object that manages the crawl queue.
     */
    private Frontier frontier;

    /**
     * Is the current crawler instance waiting for new URLs? This field is
     * mainly used by the controller to detect whether all of the crawler
     * instances are waiting for new URLs and therefore there is no more work
     * and crawling can be stopped.
     */
    private boolean isWaitingForNewURLs;

    public CrawlController getMyController() {
        return myController;
    }

    /**
     * Get the id of the current crawler instance
     *
     * @return the id of the current crawler instance
     */
    public int getMyId() {
        return myId;
    }

    /**
     * The CrawlController instance that has created this crawler instance will
     * call this function just before terminating this crawler thread. Classes
     * that extend WebCrawler can override this function to pass their local
     * data to their controller. The controller then puts these local data in a
     * List that can then be used for processing the local data of crawlers (if
     * needed).
     */
    public Object getMyLocalData() {
        return null;
    }

    public Thread getThread() {
        return myThread;
    }

    /**
     * This function is called once the header of a page is fetched. It can be
     * overwritten by sub-classes to perform custom logic for different status
     * codes. For example, 404 pages can be logged, etc.
     */
    protected void handlePageStatusCode(WebURL webUrl, int statusCode,
            String statusDescription) {
    }

    /**
     * Initializes the current instance of the crawler
     *
     * @param myId
     *            the id of this crawler instance
     * @param crawlController
     *            the controller that manages this crawling session
     */
    public void init(int myId, CrawlController crawlController) {
        this.myId = myId;
        pageFetcher = crawlController.getPageFetcher();
        robotstxtServer = crawlController.getRobotstxtServer();
        docIdServer = crawlController.getDocIdServer();
        frontier = crawlController.getFrontier();
        parser = new Parser(crawlController.getConfig());
        myController = crawlController;
        isWaitingForNewURLs = false;
    }

    public boolean isNotWaitingForNewURLs() {
        return !isWaitingForNewURLs;
    }

    /**
     * This function is called just before the termination of the current
     * crawler instance. It can be used for persisting in-memory data or other
     * finalization tasks.
     */
    public void onBeforeExit() {
    }

    /**
     * This function is called just before starting the crawl by this crawler
     * instance. It can be used for setting up the data structures or
     * initializations needed by this crawler instance.
     */
    public void onStart() {
    }

    private void processPage(WebURL curURL) {
        if (curURL == null) {
            return;
        }
        PageFetchResult fetchResult = null;
        try {
            fetchResult = pageFetcher.fetchHeader(curURL);
            int statusCode = fetchResult.getStatusCode();
            handlePageStatusCode(curURL, statusCode,
                    CustomFetchStatus.getStatusDescription(statusCode));
            if (statusCode != HttpStatus.SC_OK) {
                if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                        || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
                    if (myController.getConfig().isFollowRedirects()) {
                        String movedToUrl = fetchResult.getMovedToUrl();
                        if (movedToUrl == null) {
                            return;
                        }
                        int newDocId = docIdServer.getDocId(movedToUrl);
                        if (newDocId > 0) {
                            // Redirect page is already seen
                            return;
                        } else {
                            WebURL webURL = new WebURL();
                            webURL.setURL(movedToUrl);
                            webURL.setParentDocid(curURL.getParentDocid());
                            webURL.setParentUrl(curURL.getParentUrl());
                            webURL.setDepth(curURL.getDepth());
                            webURL.setDocid(-1);
                            if (shouldVisit(webURL)
                                    && robotstxtServer.allows(webURL)) {
                                webURL.setDocid(docIdServer
                                        .getNewDocID(movedToUrl));
                                frontier.schedule(webURL);
                            }
                        }
                    }
                } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
                    logger.info("Skipping a page which was bigger than max allowed size: "
                            + curURL.getURL());
                }
                return;
            }

            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    // Redirect page is already seen
                    return;
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult
                        .getFetchedUrl()));
            }

            Page page = new Page(curURL);
            int docid = curURL.getDocid();
            if (fetchResult.fetchContent(page)
                    && parser.parse(page, curURL.getURL())) {
                ParseData parseData = page.getParseData();
                if (parseData instanceof HtmlParseData) {
                    HtmlParseData htmlParseData = (HtmlParseData) parseData;

                    List<WebURL> toSchedule = new ArrayList<WebURL>();
                    int maxCrawlDepth = myController.getConfig()
                            .getMaxDepthOfCrawling();
                    for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
                        webURL.setParentDocid(docid);
                        webURL.setParentUrl(curURL.getURL());
                        int newdocid = docIdServer.getDocId(webURL.getURL());
                        if (newdocid > 0) {
                            // This is not the first time that this Url is
                            // visited. So, we set the depth to a negative
                            // number.
                            webURL.setDepth((short) -1);
                            webURL.setDocid(newdocid);
                        } else {
                            webURL.setDocid(-1);
                            webURL.setDepth((short) (curURL.getDepth() + 1));
                            if (maxCrawlDepth == -1
                                    || curURL.getDepth() < maxCrawlDepth) {
                                if (shouldVisit(webURL)
                                        && robotstxtServer.allows(webURL)) {
                                    webURL.setDocid(docIdServer
                                            .getNewDocID(webURL.getURL()));
                                    toSchedule.add(webURL);
                                }
                            }
                        }
                    }
                    frontier.scheduleAll(toSchedule);
                }
                visit(page);
            }
        } catch (Exception e) {
            e.printStackTrace();
            logger.error(e.getMessage() + ", while processing: "
                    + curURL.getURL());
        } finally {
            if (fetchResult != null) {
                fetchResult.discardContentIfNotConsumed();
            }
        }
    }

    @Override
    public void run() {
        onStart();
        while (true) {
            List<WebURL> assignedURLs = new ArrayList<WebURL>(50);
            isWaitingForNewURLs = true;
            frontier.getNextURLs(50, assignedURLs);
            isWaitingForNewURLs = false;
            logger.info("assignedURLs size = " + assignedURLs.size());
            if (assignedURLs.size() == 0) {
                if (frontier.isFinished()) {
                    logger.info("Exiting because frontier is finished.");
                    return;
                }
                try {
                    Thread.sleep(500);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            } else {
                for (WebURL curURL : assignedURLs) {
                    if (curURL != null) {
|
||||
processPage(curURL);
|
||||
frontier.setProcessed(curURL);
|
||||
}
|
||||
if (myController.isShuttingDown()) {
|
||||
logger.info("Exiting because of controller shutdown.");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void setThread(Thread myThread) {
|
||||
this.myThread = myThread;
|
||||
}
|
||||
|
||||
/**
|
||||
* Classes that extends WebCrawler can overwrite this function to tell the
|
||||
* crawler whether the given url should be crawled or not. The following
|
||||
* implementation indicates that all urls should be included in the crawl.
|
||||
*
|
||||
* @param url
|
||||
* the url which we are interested to know whether it should be
|
||||
* included in the crawl or not.
|
||||
* @return if the url should be included in the crawl it returns true,
|
||||
* otherwise false is returned.
|
||||
*/
|
||||
public boolean shouldVisit(WebURL url) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Classes that extends WebCrawler can overwrite this function to process
|
||||
* the content of the fetched and parsed page.
|
||||
*
|
||||
* @param page
|
||||
* the page object that is just fetched and parsed.
|
||||
*/
|
||||
public void visit(Page page) {
|
||||
}
|
||||
|
||||
}
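
shouldVisit and visit are the two extension points most subclasses touch. A minimal sketch of a subclass follows; the class name, seed domain, and filtered extensions are illustrative assumptions, not part of this source:

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class ExampleCrawler extends WebCrawler {

    @Override
    public boolean shouldVisit(WebURL url) {
        // Hypothetical filter: stay on one host and skip binary downloads.
        String href = url.getURL().toLowerCase();
        return href.startsWith("http://www.example.com/")
                && !href.endsWith(".pdf") && !href.endsWith(".zip");
    }

    @Override
    public void visit(Page page) {
        // Only HTML pages expose text and outgoing links via HtmlParseData.
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlData = (HtmlParseData) page.getParseData();
            System.out.println(page.getWebURL().getURL() + " -> "
                    + htmlData.getOutgoingUrls().size() + " outgoing links");
        }
    }
}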
@@ -1,106 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.fetcher;

import org.apache.http.HttpStatus;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class CustomFetchStatus {

    public static final int PageTooBig = 1001;

    public static final int FatalTransportError = 1005;

    public static final int UnknownError = 1006;

    public static String getStatusDescription(int code) {
        switch (code) {
        case HttpStatus.SC_OK:
            return "OK";
        case HttpStatus.SC_CREATED:
            return "Created";
        case HttpStatus.SC_ACCEPTED:
            return "Accepted";
        case HttpStatus.SC_NO_CONTENT:
            return "No Content";
        case HttpStatus.SC_MOVED_PERMANENTLY:
            return "Moved Permanently";
        case HttpStatus.SC_MOVED_TEMPORARILY:
            return "Moved Temporarily";
        case HttpStatus.SC_NOT_MODIFIED:
            return "Not Modified";
        case HttpStatus.SC_BAD_REQUEST:
            return "Bad Request";
        case HttpStatus.SC_UNAUTHORIZED:
            return "Unauthorized";
        case HttpStatus.SC_FORBIDDEN:
            return "Forbidden";
        case HttpStatus.SC_NOT_FOUND:
            return "Not Found";
        case HttpStatus.SC_INTERNAL_SERVER_ERROR:
            return "Internal Server Error";
        case HttpStatus.SC_NOT_IMPLEMENTED:
            return "Not Implemented";
        case HttpStatus.SC_BAD_GATEWAY:
            return "Bad Gateway";
        case HttpStatus.SC_SERVICE_UNAVAILABLE:
            return "Service Unavailable";
        case HttpStatus.SC_CONTINUE:
            return "Continue";
        case HttpStatus.SC_TEMPORARY_REDIRECT:
            return "Temporary Redirect";
        case HttpStatus.SC_METHOD_NOT_ALLOWED:
            return "Method Not Allowed";
        case HttpStatus.SC_CONFLICT:
            return "Conflict";
        case HttpStatus.SC_PRECONDITION_FAILED:
            return "Precondition Failed";
        case HttpStatus.SC_REQUEST_TOO_LONG:
            return "Request Too Long";
        case HttpStatus.SC_REQUEST_URI_TOO_LONG:
            return "Request-URI Too Long";
        case HttpStatus.SC_UNSUPPORTED_MEDIA_TYPE:
            return "Unsupported Media Type";
        case HttpStatus.SC_MULTIPLE_CHOICES:
            return "Multiple Choices";
        case HttpStatus.SC_SEE_OTHER:
            return "See Other";
        case HttpStatus.SC_USE_PROXY:
            return "Use Proxy";
        case HttpStatus.SC_PAYMENT_REQUIRED:
            return "Payment Required";
        case HttpStatus.SC_NOT_ACCEPTABLE:
            return "Not Acceptable";
        case HttpStatus.SC_PROXY_AUTHENTICATION_REQUIRED:
            return "Proxy Authentication Required";
        case HttpStatus.SC_REQUEST_TIMEOUT:
            return "Request Timeout";
        case PageTooBig:
            return "Page size was too big";
        case FatalTransportError:
            return "Fatal transport error";
        case UnknownError:
            return "Unknown error";
        default:
            return "(" + code + ")";
        }
    }

}
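
These descriptions are what WebCrawler.handlePageStatusCode receives. As a sketch, a subclass could log every non-OK response; the logger field assumed here is the one WebCrawler already defines:

@Override
protected void handlePageStatusCode(WebURL webUrl, int statusCode,
        String statusDescription) {
    // Covers real HTTP codes and the synthetic ones above (e.g. 1001).
    if (statusCode != HttpStatus.SC_OK) {
        logger.warn("Status " + statusCode + " (" + statusDescription
                + ") for " + webUrl.getURL());
    }
}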
@@ -1,60 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.fetcher;

import java.util.concurrent.TimeUnit;

import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;

public class IdleConnectionMonitorThread extends Thread {

    private final ThreadSafeClientConnManager connMgr;

    private volatile boolean shutdown;

    public IdleConnectionMonitorThread(ThreadSafeClientConnManager connMgr) {
        super("Connection Manager");
        this.connMgr = connMgr;
    }

    @Override
    public void run() {
        try {
            while (!shutdown) {
                synchronized (this) {
                    wait(5000);
                    // Close expired connections
                    connMgr.closeExpiredConnections();
                    // Optionally, close connections that have been idle
                    // longer than 30 seconds
                    connMgr.closeIdleConnections(30, TimeUnit.SECONDS);
                }
            }
        } catch (InterruptedException ex) {
            // terminate
        }
    }

    public void shutdown() {
        shutdown = true;
        synchronized (this) {
            notifyAll();
        }
    }

}
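
PageFetcher below starts one of these monitors against its own connection manager. Used standalone, the lifecycle is roughly the following sketch; the no-argument ThreadSafeClientConnManager constructor (which supplies a default scheme registry) is an assumption:

ThreadSafeClientConnManager connMgr = new ThreadSafeClientConnManager();
IdleConnectionMonitorThread monitor = new IdleConnectionMonitorThread(connMgr);
monitor.start();        // reaps expired/idle connections every 5 seconds
// ... run HTTP traffic through a client backed by connMgr ...
monitor.shutdown();     // wakes the wait(5000) so the loop can exit
connMgr.shutdown();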
@@ -1,105 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.fetcher;

import java.io.EOFException;
import java.io.IOException;

import org.apache.http.HttpEntity;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;

import edu.uci.ics.crawler4j.crawler.Page;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class PageFetchResult {

    protected static final Logger logger = Logger
            .getLogger(PageFetchResult.class);

    protected int statusCode;

    protected HttpEntity entity = null;

    protected String fetchedUrl = null;

    protected String movedToUrl = null;

    public int getStatusCode() {
        return statusCode;
    }

    public void setStatusCode(int statusCode) {
        this.statusCode = statusCode;
    }

    public HttpEntity getEntity() {
        return entity;
    }

    public void setEntity(HttpEntity entity) {
        this.entity = entity;
    }

    public String getFetchedUrl() {
        return fetchedUrl;
    }

    public void setFetchedUrl(String fetchedUrl) {
        this.fetchedUrl = fetchedUrl;
    }

    public boolean fetchContent(Page page) {
        try {
            page.load(entity);
            return true;
        } catch (Exception e) {
            logger.info("Exception while fetching content for: "
                    + page.getWebURL().getURL() + " [" + e.getMessage() + "]");
        }
        return false;
    }

    public void discardContentIfNotConsumed() {
        try {
            if (entity != null) {
                EntityUtils.consume(entity);
            }
        } catch (EOFException e) {
            // We can ignore this exception. It can happen on compressed
            // streams which are not repeatable.
        } catch (IOException e) {
            // We can ignore this exception. It can happen if the stream is
            // closed.
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public String getMovedToUrl() {
        return movedToUrl;
    }

    public void setMovedToUrl(String movedToUrl) {
        this.movedToUrl = movedToUrl;
    }

}
@@ -1,294 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.zip.GZIPInputStream;

import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpException;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpResponseInterceptor;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.HttpEntityWrapper;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.apache.http.protocol.HttpContext;
import org.apache.log4j.Logger;

import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class PageFetcher extends Configurable {

    protected static final Logger logger = Logger.getLogger(PageFetcher.class);

    protected ThreadSafeClientConnManager connectionManager;

    protected DefaultHttpClient httpClient;

    protected final Object mutex = new Object();

    protected long lastFetchTime = 0;

    protected IdleConnectionMonitorThread connectionMonitorThread = null;

    public PageFetcher(CrawlConfig config) {
        super(config);

        HttpParams params = new BasicHttpParams();
        HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
        paramsBean.setVersion(HttpVersion.HTTP_1_1);
        paramsBean.setContentCharset("UTF-8");
        paramsBean.setUseExpectContinue(false);

        params.setParameter(CoreProtocolPNames.USER_AGENT,
                config.getUserAgentString());
        params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT,
                config.getSocketTimeout());
        params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,
                config.getConnectionTimeout());

        params.setBooleanParameter("http.protocol.handle-redirects", false);

        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(new Scheme("http", 80,
                PlainSocketFactory.getSocketFactory()));

        if (config.isIncludeHttpsPages()) {
            schemeRegistry.register(new Scheme("https", 443,
                    SSLSocketFactory.getSocketFactory()));
        }

        connectionManager = new ThreadSafeClientConnManager(schemeRegistry);
        connectionManager.setMaxTotal(config.getMaxTotalConnections());
        connectionManager.setDefaultMaxPerRoute(config
                .getMaxConnectionsPerHost());
        httpClient = new DefaultHttpClient(connectionManager, params);

        if (config.getProxyHost() != null) {

            if (config.getProxyUsername() != null) {
                httpClient.getCredentialsProvider().setCredentials(
                        new AuthScope(config.getProxyHost(),
                                config.getProxyPort()),
                        new UsernamePasswordCredentials(
                                config.getProxyUsername(),
                                config.getProxyPassword()));
            }

            HttpHost proxy = new HttpHost(config.getProxyHost(),
                    config.getProxyPort());
            httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,
                    proxy);
        }

        httpClient.addResponseInterceptor(new HttpResponseInterceptor() {

            @Override
            public void process(final HttpResponse response,
                    final HttpContext context) throws HttpException,
                    IOException {
                HttpEntity entity = response.getEntity();
                Header contentEncoding = entity.getContentEncoding();
                if (contentEncoding != null) {
                    HeaderElement[] codecs = contentEncoding.getElements();
                    for (HeaderElement codec : codecs) {
                        if (codec.getName().equalsIgnoreCase("gzip")) {
                            response.setEntity(new GzipDecompressingEntity(
                                    response.getEntity()));
                            return;
                        }
                    }
                }
            }

        });

        if (connectionMonitorThread == null) {
            connectionMonitorThread = new IdleConnectionMonitorThread(
                    connectionManager);
        }
        connectionMonitorThread.start();

    }

    public PageFetchResult fetchHeader(WebURL webUrl) {
        PageFetchResult fetchResult = new PageFetchResult();
        String toFetchURL = webUrl.getURL();
        HttpGet get = null;
        try {
            get = new HttpGet(toFetchURL);
            synchronized (mutex) {
                long now = (new Date()).getTime();
                if (now - lastFetchTime < config.getPolitenessDelay()) {
                    Thread.sleep(config.getPolitenessDelay()
                            - (now - lastFetchTime));
                }
                lastFetchTime = (new Date()).getTime();
            }
            get.addHeader("Accept-Encoding", "gzip");
            HttpResponse response = httpClient.execute(get);
            fetchResult.setEntity(response.getEntity());

            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != HttpStatus.SC_OK) {
                if (statusCode != HttpStatus.SC_NOT_FOUND) {
                    if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                            || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
                        Header header = response.getFirstHeader("Location");
                        if (header != null) {
                            String movedToUrl = header.getValue();
                            movedToUrl = URLCanonicalizer.getCanonicalURL(
                                    movedToUrl, toFetchURL);
                            fetchResult.setMovedToUrl(movedToUrl);
                        }
                        fetchResult.setStatusCode(statusCode);
                        return fetchResult;
                    }
                    logger.info("Failed: "
                            + response.getStatusLine().toString()
                            + ", while fetching " + toFetchURL);
                }
                fetchResult.setStatusCode(response.getStatusLine()
                        .getStatusCode());
                return fetchResult;
            }

            fetchResult.setFetchedUrl(toFetchURL);
            String uri = get.getURI().toString();
            if (!uri.equals(toFetchURL)) {
                if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
                    fetchResult.setFetchedUrl(uri);
                }
            }

            if (fetchResult.getEntity() != null) {
                long size = fetchResult.getEntity().getContentLength();
                if (size == -1) {
                    Header length = response.getLastHeader("Content-Length");
                    if (length == null) {
                        length = response.getLastHeader("Content-length");
                    }
                    if (length != null) {
                        size = Integer.parseInt(length.getValue());
                    } else {
                        size = -1;
                    }
                }
                if (size > config.getMaxDownloadSize()) {
                    fetchResult.setStatusCode(CustomFetchStatus.PageTooBig);
                    return fetchResult;
                }

                fetchResult.setStatusCode(HttpStatus.SC_OK);
                return fetchResult;

            } else {
                get.abort();
            }
        } catch (IOException e) {
            logger.error("Fatal transport error: " + e.getMessage()
                    + " while fetching " + toFetchURL + " (link found in doc #"
                    + webUrl.getParentDocid() + ")");
            fetchResult.setStatusCode(CustomFetchStatus.FatalTransportError);
            return fetchResult;
        } catch (IllegalStateException e) {
            // ignoring exceptions that occur because of not registering https
            // and other schemes
        } catch (Exception e) {
            if (e.getMessage() == null) {
                logger.error("Error while fetching " + webUrl.getURL());
            } else {
                logger.error(e.getMessage() + " while fetching "
                        + webUrl.getURL());
            }
        } finally {
            try {
                if (fetchResult.getEntity() == null && get != null) {
                    get.abort();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        fetchResult.setStatusCode(CustomFetchStatus.UnknownError);
        return fetchResult;
    }

    public synchronized void shutDown() {
        if (connectionMonitorThread != null) {
            connectionManager.shutdown();
            connectionMonitorThread.shutdown();
        }
    }

    public HttpClient getHttpClient() {
        return httpClient;
    }

    private static class GzipDecompressingEntity extends HttpEntityWrapper {

        public GzipDecompressingEntity(final HttpEntity entity) {
            super(entity);
        }

        @Override
        public InputStream getContent() throws IOException,
                IllegalStateException {

            // the wrapped entity's getContent() decides about repeatability
            InputStream wrappedin = wrappedEntity.getContent();

            return new GZIPInputStream(wrappedin);
        }

        @Override
        public long getContentLength() {
            // length of ungzipped content is not known
            return -1;
        }

    }
}
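
A condensed sketch of driving this fetcher directly; the config values and URL are illustrative, and the CrawlConfig setter names mirroring the getters used above are an assumption:

CrawlConfig config = new CrawlConfig();
config.setPolitenessDelay(200);      // enforced by the mutex block in fetchHeader
config.setMaxDownloadSize(1048576);  // larger pages come back as PageTooBig

PageFetcher fetcher = new PageFetcher(config);
WebURL url = new WebURL();
url.setURL("http://www.example.com/");

PageFetchResult result = fetcher.fetchHeader(url);
try {
    if (result.getStatusCode() == HttpStatus.SC_OK) {
        Page page = new Page(url);
        result.fetchContent(page);            // pulls the (possibly gunzipped) entity
    }
} finally {
    result.discardContentIfNotConsumed();     // returns the pooled connection
    fetcher.shutDown();
}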
@@ -1,155 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.frontier;

import java.util.HashMap;
import java.util.Map;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.Transaction;

import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.util.Util;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class Counters extends Configurable {

    public class ReservedCounterNames {
        public final static String SCHEDULED_PAGES = "Scheduled-Pages";

        public final static String PROCESSED_PAGES = "Processed-Pages";
    }

    protected Database statisticsDB = null;

    protected Environment env;

    protected final Object mutex = new Object();

    protected Map<String, Long> counterValues;

    public Counters(Environment env, CrawlConfig config)
            throws DatabaseException {
        super(config);

        this.env = env;
        this.counterValues = new HashMap<String, Long>();

        /*
         * When crawling is set to be resumable, we have to keep the statistics
         * in a transactional database to make sure they are not lost if the
         * crawler crashes or is terminated unexpectedly.
         */
        if (config.isResumableCrawling()) {
            DatabaseConfig dbConfig = new DatabaseConfig();
            dbConfig.setAllowCreate(true);
            dbConfig.setTransactional(true);
            dbConfig.setDeferredWrite(false);
            statisticsDB = env.openDatabase(null, "Statistics", dbConfig);

            OperationStatus result;
            DatabaseEntry key = new DatabaseEntry();
            DatabaseEntry value = new DatabaseEntry();
            Transaction tnx = env.beginTransaction(null, null);
            Cursor cursor = statisticsDB.openCursor(tnx, null);
            result = cursor.getFirst(key, value, null);

            while (result == OperationStatus.SUCCESS) {
                if (value.getData().length > 0) {
                    String name = new String(key.getData());
                    long counterValue = Util.byteArray2Long(value.getData());
                    counterValues.put(name, counterValue);
                }
                result = cursor.getNext(key, value, null);
            }
            cursor.close();
            tnx.commit();
        }
    }

    public long getValue(String name) {
        synchronized (mutex) {
            Long value = counterValues.get(name);
            if (value == null) {
                return 0;
            }
            return value;
        }
    }

    public void setValue(String name, long value) {
        synchronized (mutex) {
            try {
                counterValues.put(name, value);
                if (statisticsDB != null) {
                    Transaction txn = env.beginTransaction(null, null);
                    statisticsDB.put(txn, new DatabaseEntry(name.getBytes()),
                            new DatabaseEntry(Util.long2ByteArray(value)));
                    txn.commit();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public void increment(String name) {
        increment(name, 1);
    }

    public void increment(String name, long addition) {
        synchronized (mutex) {
            long prevValue = getValue(name);
            setValue(name, prevValue + addition);
        }
    }

    public void sync() {
        if (config.isResumableCrawling()) {
            return;
        }
        if (statisticsDB == null) {
            return;
        }
        try {
            statisticsDB.sync();
        } catch (DatabaseException e) {
            e.printStackTrace();
        }
    }

    public void close() {
        try {
            if (statisticsDB != null) {
                statisticsDB.close();
            }
        } catch (DatabaseException e) {
            e.printStackTrace();
        }
    }
}
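
Frontier below is the only caller of the two reserved names; as a sketch, the same API also carries ad-hoc statistics (an open Environment `env`, a CrawlConfig `config`, and the custom counter name are assumptions):

Counters counters = new Counters(env, config);
counters.increment(Counters.ReservedCounterNames.SCHEDULED_PAGES, 10);
counters.increment("My-Custom-Counter");   // hypothetical counter name
long scheduled = counters.getValue(Counters.ReservedCounterNames.SCHEDULED_PAGES);
counters.close();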
@@ -1,176 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.frontier;

import org.apache.log4j.Logger;

import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.OperationStatus;

import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.util.Util;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class DocIDServer extends Configurable {

    protected static final Logger logger = Logger.getLogger(DocIDServer.class
            .getName());

    protected Database docIDsDB = null;

    protected final Object mutex = new Object();

    protected int lastDocID;

    public DocIDServer(Environment env, CrawlConfig config)
            throws DatabaseException {
        super(config);
        DatabaseConfig dbConfig = new DatabaseConfig();
        dbConfig.setAllowCreate(true);
        dbConfig.setTransactional(config.isResumableCrawling());
        dbConfig.setDeferredWrite(!config.isResumableCrawling());
        docIDsDB = env.openDatabase(null, "DocIDs", dbConfig);
        if (config.isResumableCrawling()) {
            int docCount = getDocCount();
            if (docCount > 0) {
                logger.info("Loaded " + docCount
                        + " URLs that had been detected in previous crawl.");
                lastDocID = docCount;
            }
        } else {
            lastDocID = 0;
        }
    }

    /**
     * Returns the docid of an already seen URL.
     *
     * @param url
     *            the URL for which the docid is returned.
     * @return the docid of the URL if it has been seen before. Otherwise -1
     *         is returned.
     */
    public int getDocId(String url) {
        synchronized (mutex) {
            if (docIDsDB == null) {
                return -1;
            }
            OperationStatus result;
            DatabaseEntry value = new DatabaseEntry();
            try {
                DatabaseEntry key = new DatabaseEntry(url.getBytes());
                result = docIDsDB.get(null, key, value, null);

                if (result == OperationStatus.SUCCESS
                        && value.getData().length > 0) {
                    return Util.byteArray2Int(value.getData());
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            return -1;
        }
    }

    public int getNewDocID(String url) {
        synchronized (mutex) {
            try {
                // Make sure that we have not already assigned a docid for
                // this URL
                int docid = getDocId(url);
                if (docid > 0) {
                    return docid;
                }

                lastDocID++;
                docIDsDB.put(null, new DatabaseEntry(url.getBytes()),
                        new DatabaseEntry(Util.int2ByteArray(lastDocID)));
                return lastDocID;
            } catch (Exception e) {
                e.printStackTrace();
            }
            return -1;
        }
    }

    public void addUrlAndDocId(String url, int docId) throws Exception {
        synchronized (mutex) {
            if (docId <= lastDocID) {
                throw new Exception("Requested doc id: " + docId
                        + " is not larger than: " + lastDocID);
            }

            // Make sure that we have not already assigned a docid for this URL
            int prevDocid = getDocId(url);
            if (prevDocid > 0) {
                if (prevDocid == docId) {
                    return;
                }
                throw new Exception("Doc id: " + prevDocid
                        + " is already assigned to URL: " + url);
            }

            docIDsDB.put(null, new DatabaseEntry(url.getBytes()),
                    new DatabaseEntry(Util.int2ByteArray(docId)));
            lastDocID = docId;
        }
    }

    public boolean isSeenBefore(String url) {
        return getDocId(url) != -1;
    }

    public int getDocCount() {
        try {
            return (int) docIDsDB.count();
        } catch (DatabaseException e) {
            e.printStackTrace();
        }
        return -1;
    }

    public void sync() {
        if (config.isResumableCrawling()) {
            return;
        }
        if (docIDsDB == null) {
            return;
        }
        try {
            docIDsDB.sync();
        } catch (DatabaseException e) {
            e.printStackTrace();
        }
    }

    public void close() {
        try {
            docIDsDB.close();
        } catch (DatabaseException e) {
            e.printStackTrace();
        }
    }
}
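
The intended contract, as a sketch; an open Environment `env` and a CrawlConfig `config` are assumed, and the URL is illustrative:

DocIDServer docIdServer = new DocIDServer(env, config);
String url = "http://www.example.com/page";
int id = docIdServer.getDocId(url);      // -1 on first sight
if (id < 0) {
    id = docIdServer.getNewDocID(url);   // assigns the next sequential docid
}
assert docIdServer.isSeenBefore(url);    // now resolvable by getDocId
docIdServer.close();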
@@ -1,216 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.frontier;

import java.util.List;

import org.apache.log4j.Logger;

import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;

import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.frontier.Counters.ReservedCounterNames;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class Frontier extends Configurable {

    protected static final Logger logger = Logger.getLogger(Frontier.class
            .getName());

    protected WorkQueues workQueues;

    protected InProcessPagesDB inProcessPages;

    protected final Object mutex = new Object();

    protected final Object waitingList = new Object();

    protected boolean isFinished = false;

    protected long scheduledPages;

    protected DocIDServer docIdServer;

    protected Counters counters;

    public Frontier(Environment env, CrawlConfig config,
            DocIDServer docIdServer) {
        super(config);
        this.counters = new Counters(env, config);
        this.docIdServer = docIdServer;
        try {
            workQueues = new WorkQueues(env, "PendingURLsDB",
                    config.isResumableCrawling());
            if (config.isResumableCrawling()) {
                scheduledPages = counters
                        .getValue(ReservedCounterNames.SCHEDULED_PAGES);
                inProcessPages = new InProcessPagesDB(env);
                long numPreviouslyInProcessPages = inProcessPages.getLength();
                if (numPreviouslyInProcessPages > 0) {
                    logger.info("Rescheduling " + numPreviouslyInProcessPages
                            + " URLs from previous crawl.");
                    scheduledPages -= numPreviouslyInProcessPages;
                    while (true) {
                        List<WebURL> urls = inProcessPages.get(100);
                        if (urls.size() == 0) {
                            break;
                        }
                        scheduleAll(urls);
                        inProcessPages.delete(urls.size());
                    }
                }
            } else {
                inProcessPages = null;
                scheduledPages = 0;
            }
        } catch (DatabaseException e) {
            logger.error("Error while initializing the Frontier: "
                    + e.getMessage());
            workQueues = null;
        }
    }

    public void scheduleAll(List<WebURL> urls) {
        int maxPagesToFetch = config.getMaxPagesToFetch();
        synchronized (mutex) {
            int newScheduledPages = 0;
            for (WebURL url : urls) {
                if (maxPagesToFetch > 0
                        && (scheduledPages + newScheduledPages) >= maxPagesToFetch) {
                    break;
                }
                try {
                    workQueues.put(url);
                    newScheduledPages++;
                } catch (DatabaseException e) {
                    logger.error("Error while putting the URL in the work queue.");
                }
            }
            if (newScheduledPages > 0) {
                scheduledPages += newScheduledPages;
                counters.increment(
                        Counters.ReservedCounterNames.SCHEDULED_PAGES,
                        newScheduledPages);
            }
            synchronized (waitingList) {
                waitingList.notifyAll();
            }
        }
    }

    public void schedule(WebURL url) {
        int maxPagesToFetch = config.getMaxPagesToFetch();
        synchronized (mutex) {
            try {
                if (maxPagesToFetch < 0 || scheduledPages < maxPagesToFetch) {
                    workQueues.put(url);
                    scheduledPages++;
                    counters.increment(Counters.ReservedCounterNames.SCHEDULED_PAGES);
                }
            } catch (DatabaseException e) {
                logger.error("Error while putting the URL in the work queue.");
            }
        }
    }

    public void getNextURLs(int max, List<WebURL> result) {
        while (true) {
            synchronized (mutex) {
                if (isFinished) {
                    return;
                }
                try {
                    List<WebURL> curResults = workQueues.get(max);
                    workQueues.delete(curResults.size());
                    if (inProcessPages != null) {
                        for (WebURL curPage : curResults) {
                            inProcessPages.put(curPage);
                        }
                    }
                    result.addAll(curResults);
                } catch (DatabaseException e) {
                    logger.error("Error while getting next URLs: "
                            + e.getMessage());
                    e.printStackTrace();
                }
                if (result.size() > 0) {
                    return;
                }
            }
            try {
                synchronized (waitingList) {
                    waitingList.wait();
                }
            } catch (InterruptedException ignored) {
            }
            if (isFinished) {
                return;
            }
        }
    }

    public void setProcessed(WebURL webURL) {
        counters.increment(ReservedCounterNames.PROCESSED_PAGES);
        if (inProcessPages != null) {
            if (!inProcessPages.removeURL(webURL)) {
                logger.warn("Could not remove: " + webURL.getURL()
                        + " from list of processed pages.");
            }
        }
    }

    public long getQueueLength() {
        return workQueues.getLength();
    }

    public long getNumberOfAssignedPages() {
        return inProcessPages.getLength();
    }

    public long getNumberOfProcessedPages() {
        return counters.getValue(ReservedCounterNames.PROCESSED_PAGES);
    }

    public void sync() {
        workQueues.sync();
        docIdServer.sync();
        counters.sync();
    }

    public boolean isFinished() {
        return isFinished;
    }

    public void close() {
        sync();
        workQueues.close();
        counters.close();
    }

    public void finish() {
        isFinished = true;
        synchronized (waitingList) {
            waitingList.notifyAll();
        }
    }
}
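
The scheduleAll/getNextURLs/setProcessed handshake above is exactly what WebCrawler.run consumes; condensed to a sketch, with a Frontier `frontier` wired up as in CrawlController assumed:

List<WebURL> batch = new ArrayList<WebURL>(50);
while (!frontier.isFinished()) {
    batch.clear();
    frontier.getNextURLs(50, batch);  // blocks on waitingList until work arrives
    for (WebURL url : batch) {
        // ... fetch and parse url ...
        frontier.setProcessed(url);   // also removes it from InProcessPagesDB
    }
}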
@@ -1,91 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.frontier;

import org.apache.log4j.Logger;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.Transaction;

import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;

/**
 * This class maintains the list of pages which are assigned to crawlers but
 * are not yet processed. It is used for resuming a previous crawl.
 *
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class InProcessPagesDB extends WorkQueues {

    private static final Logger logger = Logger
            .getLogger(InProcessPagesDB.class.getName());

    public InProcessPagesDB(Environment env) throws DatabaseException {
        super(env, "InProcessPagesDB", true);
        long docCount = getLength();
        if (docCount > 0) {
            logger.info("Loaded " + docCount
                    + " URLs that have been in process in the previous crawl.");
        }
    }

    public boolean removeURL(WebURL webUrl) {
        synchronized (mutex) {
            try {
                DatabaseEntry key = new DatabaseEntry(
                        Util.int2ByteArray(webUrl.getDocid()));
                Cursor cursor = null;
                OperationStatus result;
                DatabaseEntry value = new DatabaseEntry();
                Transaction txn = env.beginTransaction(null, null);
                try {
                    cursor = urlsDB.openCursor(txn, null);
                    result = cursor.getSearchKey(key, value, null);

                    if (result == OperationStatus.SUCCESS) {
                        result = cursor.delete();
                        if (result == OperationStatus.SUCCESS) {
                            return true;
                        }
                    }
                } catch (DatabaseException e) {
                    if (txn != null) {
                        txn.abort();
                        txn = null;
                    }
                    throw e;
                } finally {
                    if (cursor != null) {
                        cursor.close();
                    }
                    if (txn != null) {
                        txn.commit();
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return false;
    }
}
@@ -1,50 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.frontier;

import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;

import edu.uci.ics.crawler4j.url.WebURL;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class WebURLTupleBinding extends TupleBinding<WebURL> {

    @Override
    public WebURL entryToObject(TupleInput input) {
        WebURL webURL = new WebURL();
        webURL.setURL(input.readString());
        webURL.setDocid(input.readInt());
        webURL.setParentDocid(input.readInt());
        webURL.setParentUrl(input.readString());
        webURL.setDepth(input.readShort());
        return webURL;
    }

    @Override
    public void objectToEntry(WebURL url, TupleOutput output) {
        output.writeString(url.getURL());
        output.writeInt(url.getDocid());
        output.writeInt(url.getParentDocid());
        output.writeString(url.getParentUrl());
        output.writeShort(url.getDepth());
    }
}
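
WorkQueues below is the real consumer of this binding; in isolation the round trip looks like the following sketch, assuming the entryToObject(DatabaseEntry) convenience overload that TupleBinding inherits:

WebURLTupleBinding binding = new WebURLTupleBinding();

WebURL url = new WebURL();
url.setURL("http://www.example.com/");
url.setDocid(42);

DatabaseEntry entry = new DatabaseEntry();
binding.objectToEntry(url, entry);                  // fields written in a fixed order
WebURL roundTripped = binding.entryToObject(entry); // and read back in the same order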
@@ -1,197 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.frontier;

import java.util.ArrayList;
import java.util.List;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.Transaction;

import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class WorkQueues {

    protected Database urlsDB = null;

    protected Environment env;

    protected boolean resumable;

    protected WebURLTupleBinding webURLBinding;

    protected final Object mutex = new Object();

    public WorkQueues(Environment env, String dbName, boolean resumable)
            throws DatabaseException {
        this.env = env;
        this.resumable = resumable;
        DatabaseConfig dbConfig = new DatabaseConfig();
        dbConfig.setAllowCreate(true);
        dbConfig.setTransactional(resumable);
        dbConfig.setDeferredWrite(!resumable);
        urlsDB = env.openDatabase(null, dbName, dbConfig);
        webURLBinding = new WebURLTupleBinding();
    }

    public List<WebURL> get(int max) throws DatabaseException {
        synchronized (mutex) {
            int matches = 0;
            List<WebURL> results = new ArrayList<WebURL>(max);

            Cursor cursor = null;
            OperationStatus result;
            DatabaseEntry key = new DatabaseEntry();
            DatabaseEntry value = new DatabaseEntry();
            Transaction txn;
            if (resumable) {
                txn = env.beginTransaction(null, null);
            } else {
                txn = null;
            }
            try {
                cursor = urlsDB.openCursor(txn, null);
                result = cursor.getFirst(key, value, null);

                while (matches < max && result == OperationStatus.SUCCESS) {
                    if (value.getData().length > 0) {
                        results.add(webURLBinding.entryToObject(value));
                        matches++;
                    }
                    result = cursor.getNext(key, value, null);
                }
            } catch (DatabaseException e) {
                if (txn != null) {
                    txn.abort();
                    txn = null;
                }
                throw e;
            } finally {
                if (cursor != null) {
                    cursor.close();
                }
                if (txn != null) {
                    txn.commit();
                }
            }
            return results;
        }
    }

    public void delete(int count) throws DatabaseException {
        synchronized (mutex) {
            int matches = 0;

            Cursor cursor = null;
            OperationStatus result;
            DatabaseEntry key = new DatabaseEntry();
            DatabaseEntry value = new DatabaseEntry();
            Transaction txn;
            if (resumable) {
                txn = env.beginTransaction(null, null);
            } else {
                txn = null;
            }
            try {
                cursor = urlsDB.openCursor(txn, null);
                result = cursor.getFirst(key, value, null);

                while (matches < count && result == OperationStatus.SUCCESS) {
                    cursor.delete();
                    matches++;
                    result = cursor.getNext(key, value, null);
                }
            } catch (DatabaseException e) {
                if (txn != null) {
                    txn.abort();
                    txn = null;
                }
                throw e;
            } finally {
                if (cursor != null) {
                    cursor.close();
                }
                if (txn != null) {
                    txn.commit();
                }
            }
        }
    }

    public void put(WebURL url) throws DatabaseException {
        byte[] keyData = Util.int2ByteArray(url.getDocid());
        DatabaseEntry value = new DatabaseEntry();
        webURLBinding.objectToEntry(url, value);
        Transaction txn;
        if (resumable) {
            txn = env.beginTransaction(null, null);
        } else {
            txn = null;
        }
        urlsDB.put(txn, new DatabaseEntry(keyData), value);
        if (resumable) {
            if (txn != null) {
                txn.commit();
            }
        }
    }

    public long getLength() {
        try {
            return urlsDB.count();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return -1;
    }

    public void sync() {
        if (resumable) {
            return;
        }
        if (urlsDB == null) {
            return;
        }
        try {
            urlsDB.sync();
        } catch (DatabaseException e) {
            e.printStackTrace();
        }
    }

    public void close() {
        try {
            urlsDB.close();
        } catch (DatabaseException e) {
            e.printStackTrace();
        }
    }
}
@@ -1,34 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.parser;

public class BinaryParseData implements ParseData {

    private static BinaryParseData instance = new BinaryParseData();

    public static BinaryParseData getInstance() {
        return instance;
    }

    @Override
    public String toString() {
        return "[Binary parse data can not be dumped as string]";
    }
}
@@ -1,166 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.parser;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

public class HtmlContentHandler extends DefaultHandler {

    private enum Element {
        A, AREA, LINK, IFRAME, FRAME, EMBED, IMG, BASE, META, BODY
    }

    private static class HtmlFactory {
        private static Map<String, Element> name2Element;

        static {
            name2Element = new HashMap<String, Element>();
            for (Element element : Element.values()) {
                name2Element.put(element.toString().toLowerCase(), element);
            }
        }

        public static Element getElement(String name) {
            return name2Element.get(name);
        }
    }

    private String base;

    private String metaRefresh;

    private String metaLocation;

    private boolean isWithinBodyElement;

    private StringBuilder bodyText;

    private Set<String> outgoingUrls;

    public HtmlContentHandler() {
        isWithinBodyElement = false;
        bodyText = new StringBuilder();
        outgoingUrls = new HashSet<String>();
    }

    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        Element element = HtmlFactory.getElement(localName);

        if (element == Element.A || element == Element.AREA
                || element == Element.LINK) {
            String href = attributes.getValue("href");
            if (href != null) {
                outgoingUrls.add(href);
            }
            return;
        }

        if (element == Element.IMG) {
            String imgSrc = attributes.getValue("src");
            if (imgSrc != null) {
                outgoingUrls.add(imgSrc);
            }
            return;
        }

        if (element == Element.IFRAME || element == Element.FRAME
                || element == Element.EMBED) {
            String src = attributes.getValue("src");
            if (src != null) {
                outgoingUrls.add(src);
            }
            return;
        }

        if (element == Element.BASE) {
            // We only consider the first occurrence of the Base element,
            // so the href is recorded only while base is still unset.
            if (base == null) {
                String href = attributes.getValue("href");
                if (href != null) {
                    base = href;
                }
            }
            return;
        }

        if (element == Element.META) {
            String equiv = attributes.getValue("http-equiv");
            String content = attributes.getValue("content");
            if (equiv != null && content != null) {
                equiv = equiv.toLowerCase();

                // http-equiv="refresh" content="0;URL=http://foo.bar/..."
                if (equiv.equals("refresh") && (metaRefresh == null)) {
                    int pos = content.toLowerCase().indexOf("url=");
                    if (pos != -1) {
                        metaRefresh = content.substring(pos + 4);
                    }
                }

                // http-equiv="location" content="http://foo.bar/..."
                if (equiv.equals("location") && (metaLocation == null)) {
                    metaLocation = content;
                }
            }
            return;
        }

        if (element == Element.BODY) {
            isWithinBodyElement = true;
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        Element element = HtmlFactory.getElement(localName);
        if (element == Element.BODY) {
            isWithinBodyElement = false;
        }
    }

    @Override
    public void characters(char ch[], int start, int length)
            throws SAXException {
        if (isWithinBodyElement) {
            bodyText.append(ch, start, length);
        }
    }

    public String getBodyText() {
        return bodyText.toString();
    }

    public Set<String> getOutgoingUrls() {
        return outgoingUrls;
    }

    public String getBaseUrl() {
        return base;
    }

}
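As a reference for how this handler is driven, here is a minimal, self-contained sketch using the same Tika HtmlParser call that the Parser class below makes; the sample HTML string is invented for illustration:

import java.io.ByteArrayInputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;

import edu.uci.ics.crawler4j.parser.HtmlContentHandler;

public class LinkExtractionSketch {
    public static void main(String[] args) throws Exception {
        String html = "<html><body><a href=\"/a.html\">a</a>"
                + "<img src=\"/b.png\"/></body></html>";
        HtmlContentHandler handler = new HtmlContentHandler();
        // Tika drives the SAX callbacks above (startElement, characters, ...)
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")),
                handler, new Metadata(), new ParseContext());
        System.out.println(handler.getOutgoingUrls()); // [/a.html, /b.png]
    }
}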
@@ -1,71 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.parser;

import java.util.List;

import edu.uci.ics.crawler4j.url.WebURL;

public class HtmlParseData implements ParseData {

    private String html;

    private String text;

    private String title;

    private List<WebURL> outgoingUrls;

    public String getHtml() {
        return html;
    }

    public void setHtml(String html) {
        this.html = html;
    }

    public String getText() {
        return text;
    }

    public void setText(String text) {
        this.text = text;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public List<WebURL> getOutgoingUrls() {
        return outgoingUrls;
    }

    public void setOutgoingUrls(List<WebURL> outgoingUrls) {
        this.outgoingUrls = outgoingUrls;
    }

    @Override
    public String toString() {
        return text;
    }

}
@@ -1,25 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.parser;

public interface ParseData {

    @Override
    public String toString();

}
@@ -1,162 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.parser;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;

import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class Parser extends Configurable {

    private HtmlParser htmlParser;

    private ParseContext parseContext;

    public Parser(CrawlConfig config) {
        super(config);
        htmlParser = new HtmlParser();
        parseContext = new ParseContext();
    }

    public boolean parse(Page page, String contextURL) {

        if (Util.hasBinaryContent(page.getContentType())) {
            if (!config.isIncludeBinaryContentInCrawling()) {
                return false;
            } else {
                page.setParseData(BinaryParseData.getInstance());
                return true;
            }
        } else if (Util.hasPlainTextContent(page.getContentType())) {
            try {
                TextParseData parseData = new TextParseData();
                parseData.setTextContent(new String(page.getContentData(),
                        page.getContentCharset()));
                page.setParseData(parseData);
                return true;
            } catch (NullPointerException npe) {
                // ignored: the content could not be decoded as text
            } catch (Exception e) {
                e.printStackTrace();
            }
            return false;
        }

        Metadata metadata = new Metadata();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        InputStream inputStream = null;
        try {
            inputStream = new ByteArrayInputStream(page.getContentData());
            htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (inputStream != null) {
                    inputStream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        if (page.getContentCharset() == null) {
            page.setContentCharset(metadata.get("Content-Encoding"));
        }

        HtmlParseData parseData = new HtmlParseData();
        parseData.setText(contentHandler.getBodyText().trim());
        parseData.setTitle(metadata.get(Metadata.TITLE));

        Set<String> urls = new HashSet<String>();

        String baseURL = contentHandler.getBaseUrl();
        if (baseURL != null) {
            contextURL = baseURL;
        }

        int urlCount = 0;
        for (String href : contentHandler.getOutgoingUrls()) {
            href = href.trim();
            if (href.length() == 0) {
                continue;
            }
            String hrefWithoutProtocol = href.toLowerCase();
            if (href.startsWith("http://")) {
                hrefWithoutProtocol = href.substring(7);
            }
            if (!hrefWithoutProtocol.contains("javascript:")
                    && !hrefWithoutProtocol.contains("@")) {
                String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
                if (url != null) {
                    urls.add(url);
                    urlCount++;
                    if (urlCount > config.getMaxOutgoingLinksToFollow()) {
                        break;
                    }
                }
            }
        }

        List<WebURL> outgoingUrls = new ArrayList<WebURL>();
        for (String url : urls) {
            WebURL webURL = new WebURL();
            webURL.setURL(url);
            outgoingUrls.add(webURL);
        }
        parseData.setOutgoingUrls(outgoingUrls);

        try {
            if (page.getContentCharset() == null) {
                parseData.setHtml(new String(page.getContentData()));
            } else {
                parseData.setHtml(new String(page.getContentData(),
                        page.getContentCharset()));
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return false;
        }

        page.setParseData(parseData);
        return true;
    }

}
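A usage sketch, with two assumptions flagged: the Page is taken to be already populated by the fetcher (as RobotstxtServer does via PageFetchResult.fetchContent), and Page is assumed to expose a getParseData() getter matching the setParseData() calls above:

import java.util.Collections;
import java.util.List;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.url.WebURL;

public class ParserUsageSketch {
    // `page` must already hold fetched content (assumption, see above).
    static List<WebURL> extractLinks(CrawlConfig config, Page page, String pageUrl) {
        Parser parser = new Parser(config);
        if (parser.parse(page, pageUrl)
                && page.getParseData() instanceof HtmlParseData) {
            return ((HtmlParseData) page.getParseData()).getOutgoingUrls();
        }
        return Collections.emptyList();
    }
}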
@@ -1,37 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.parser;

public class TextParseData implements ParseData {

    private String textContent;

    public String getTextContent() {
        return textContent;
    }

    public void setTextContent(String textContent) {
        this.textContent = textContent;
    }

    @Override
    public String toString() {
        return textContent;
    }

}
@@ -1,62 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.robotstxt;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class HostDirectives {

    // If the directives for this host were fetched more than
    // 24 hours ago, they have to be re-fetched.
    private static final long EXPIRATION_DELAY = 24 * 60 * 60 * 1000L;

    private RuleSet disallows = new RuleSet();

    private RuleSet allows = new RuleSet();

    private long timeFetched;

    private long timeLastAccessed;

    public HostDirectives() {
        timeFetched = System.currentTimeMillis();
    }

    public boolean needsRefetch() {
        return (System.currentTimeMillis() - timeFetched > EXPIRATION_DELAY);
    }

    public boolean allows(String path) {
        timeLastAccessed = System.currentTimeMillis();
        return !disallows.containsPrefixOf(path) || allows.containsPrefixOf(path);
    }

    public void addDisallow(String path) {
        disallows.add(path);
    }

    public void addAllow(String path) {
        allows.add(path);
    }

    public long getLastAccessTime() {
        return timeLastAccessed;
    }
}
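The precedence implemented by allows() in one small sketch: a path is permitted unless a Disallow prefix matches it and no Allow prefix does:

import edu.uci.ics.crawler4j.robotstxt.HostDirectives;

public class HostDirectivesDemo {
    public static void main(String[] args) {
        HostDirectives d = new HostDirectives();
        d.addDisallow("/private/");
        d.addAllow("/private/public/");
        System.out.println(d.allows("/index.html"));            // true: no matching Disallow
        System.out.println(d.allows("/private/secret.html"));   // false: Disallow prefix matches
        System.out.println(d.allows("/private/public/a.html")); // true: explicit Allow overrides
    }
}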
@@ -1,63 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.robotstxt;

public class RobotstxtConfig {

    /**
     * Should the crawler obey the Robots.txt protocol? More info on
     * Robots.txt is available at http://www.robotstxt.org/
     */
    private boolean enabled = true;

    /**
     * The user-agent name that will be used to determine whether some servers
     * have specific rules for this agent name.
     */
    private String userAgentName = "crawler4j";

    /**
     * The maximum number of hosts for which their robots.txt is cached.
     */
    private int cacheSize = 500;

    public boolean isEnabled() {
        return enabled;
    }

    public void setEnabled(boolean enabled) {
        this.enabled = enabled;
    }

    public String getUserAgentName() {
        return userAgentName;
    }

    public void setUserAgentName(String userAgentName) {
        this.userAgentName = userAgentName;
    }

    public int getCacheSize() {
        return cacheSize;
    }

    public void setCacheSize(int cacheSize) {
        this.cacheSize = cacheSize;
    }

}
@@ -1,101 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.robotstxt;

import java.util.StringTokenizer;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class RobotstxtParser {

    private static final String PATTERNS_USERAGENT = "(?i)^User-agent:.*";

    private static final String PATTERNS_DISALLOW = "(?i)Disallow:.*";

    private static final String PATTERNS_ALLOW = "(?i)Allow:.*";

    private static final int PATTERNS_USERAGENT_LENGTH = 11;

    private static final int PATTERNS_DISALLOW_LENGTH = 9;

    private static final int PATTERNS_ALLOW_LENGTH = 6;

    public static HostDirectives parse(String content, String myUserAgent) {

        HostDirectives directives = null;
        boolean inMatchingUserAgent = false;

        StringTokenizer st = new StringTokenizer(content, "\n");
        while (st.hasMoreTokens()) {
            String line = st.nextToken();

            int commentIndex = line.indexOf("#");
            if (commentIndex > -1) {
                line = line.substring(0, commentIndex);
            }

            // remove any html markup
            line = line.replaceAll("<[^>]+>", "");

            line = line.trim();

            if (line.length() == 0) {
                continue;
            }

            if (line.matches(PATTERNS_USERAGENT)) {
                String ua = line.substring(PATTERNS_USERAGENT_LENGTH).trim().toLowerCase();
                if (ua.equals("*") || ua.contains(myUserAgent)) {
                    inMatchingUserAgent = true;
                    if (directives == null) {
                        directives = new HostDirectives();
                    }
                } else {
                    inMatchingUserAgent = false;
                }
            } else if (line.matches(PATTERNS_DISALLOW)) {
                if (!inMatchingUserAgent) {
                    continue;
                }
                String path = line.substring(PATTERNS_DISALLOW_LENGTH).trim();
                if (path.endsWith("*")) {
                    path = path.substring(0, path.length() - 1);
                }
                path = path.trim();
                if (path.length() > 0) {
                    directives.addDisallow(path);
                }
            } else if (line.matches(PATTERNS_ALLOW)) {
                if (!inMatchingUserAgent) {
                    continue;
                }
                String path = line.substring(PATTERNS_ALLOW_LENGTH).trim();
                if (path.endsWith("*")) {
                    path = path.substring(0, path.length() - 1);
                }
                path = path.trim();
                directives.addAllow(path);
            }
        }

        return directives;
    }
}
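A sketch of parse() on an invented robots.txt body; note that the agent name is compared against the lowercased User-agent line, and RobotstxtServer passes it through from RobotstxtConfig:

import edu.uci.ics.crawler4j.robotstxt.HostDirectives;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtParser;

public class RobotstxtParserDemo {
    public static void main(String[] args) {
        String robots = "User-agent: *\n"
                + "Disallow: /tmp/\n"
                + "Allow: /tmp/ok/\n";
        HostDirectives d = RobotstxtParser.parse(robots, "crawler4j");
        System.out.println(d.allows("/tmp/x.html")); // false: Disallow prefix matches
        System.out.println(d.allows("/tmp/ok/y"));   // true: Allow prefix overrides
    }
}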
@@ -1,132 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.robotstxt;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.http.HttpStatus;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class RobotstxtServer {

    protected RobotstxtConfig config;

    protected final Map<String, HostDirectives> host2directivesCache = new HashMap<String, HostDirectives>();

    protected PageFetcher pageFetcher;

    public RobotstxtServer(RobotstxtConfig config, PageFetcher pageFetcher) {
        this.config = config;
        this.pageFetcher = pageFetcher;
    }

    public boolean allows(WebURL webURL) {
        if (!config.isEnabled()) {
            return true;
        }
        try {
            URL url = new URL(webURL.getURL());
            String host = url.getHost().toLowerCase();
            String path = url.getPath();

            HostDirectives directives = host2directivesCache.get(host);

            if (directives != null && directives.needsRefetch()) {
                synchronized (host2directivesCache) {
                    host2directivesCache.remove(host);
                    directives = null;
                }
            }

            if (directives == null) {
                directives = fetchDirectives(host);
            }

            return directives.allows(path);

        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
        return true;
    }

    private HostDirectives fetchDirectives(String host) {
        WebURL robotsTxtUrl = new WebURL();
        robotsTxtUrl.setURL("http://" + host + "/robots.txt");
        HostDirectives directives = null;
        PageFetchResult fetchResult = null;
        try {
            fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
            if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
                Page page = new Page(robotsTxtUrl);
                fetchResult.fetchContent(page);
                if (Util.hasPlainTextContent(page.getContentType())) {
                    try {
                        String content;
                        if (page.getContentCharset() == null) {
                            content = new String(page.getContentData());
                        } else {
                            content = new String(page.getContentData(),
                                    page.getContentCharset());
                        }
                        directives = RobotstxtParser.parse(content,
                                config.getUserAgentName());
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            }
        } finally {
            // fetchResult may still be null if fetchHeader threw
            if (fetchResult != null) {
                fetchResult.discardContentIfNotConsumed();
            }
        }
        if (directives == null) {
            // We still need to have this object to keep track of the time we
            // fetched it
            directives = new HostDirectives();
        }
        synchronized (host2directivesCache) {
            if (host2directivesCache.size() == config.getCacheSize()) {
                String minHost = null;
                long minAccessTime = Long.MAX_VALUE;
                for (Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) {
                    if (entry.getValue().getLastAccessTime() < minAccessTime) {
                        minAccessTime = entry.getValue().getLastAccessTime();
                        minHost = entry.getKey();
                    }
                }
                host2directivesCache.remove(minHost);
            }
            host2directivesCache.put(host, directives);
        }
        return directives;
    }

}
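A wiring sketch; the PageFetcher(CrawlConfig) constructor is assumed from the crawler4j fetcher package rather than shown in this diff:

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

public class RobotsCheckSketch {
    public static void main(String[] args) {
        RobotstxtConfig robotsConfig = new RobotstxtConfig();
        robotsConfig.setUserAgentName("crawler4j");
        PageFetcher fetcher = new PageFetcher(new CrawlConfig()); // assumed ctor
        RobotstxtServer server = new RobotstxtServer(robotsConfig, fetcher);

        WebURL url = new WebURL();
        url.setURL("http://example.com/private/page.html");
        // fetches http://example.com/robots.txt once, then serves from cache
        System.out.println(server.allows(url));
    }
}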
@@ -1,53 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.robotstxt;

import java.util.SortedSet;
import java.util.TreeSet;

public class RuleSet extends TreeSet<String> {

    private static final long serialVersionUID = 1L;

    @Override
    public boolean add(String str) {
        SortedSet<String> sub = headSet(str);
        if (!sub.isEmpty() && str.startsWith(sub.last())) {
            // no need to add; prefix is already present
            return false;
        }
        boolean retVal = super.add(str);
        sub = tailSet(str + "\0");
        while (!sub.isEmpty() && sub.first().startsWith(str)) {
            // remove redundant entries
            sub.remove(sub.first());
        }
        return retVal;
    }

    public boolean containsPrefixOf(String s) {
        SortedSet<String> sub = headSet(s);
        // because redundant prefixes have been eliminated,
        // only a test against last item in headSet is necessary
        if (!sub.isEmpty() && s.startsWith(sub.last())) {
            return true; // prefix substring exists
        }
        // might still exist exactly (headSet does not contain boundary)
        return contains(s);
    }
}
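The prefix elimination in one sketch; each result follows directly from the add()/containsPrefixOf() logic above:

import edu.uci.ics.crawler4j.robotstxt.RuleSet;

public class RuleSetDemo {
    public static void main(String[] args) {
        RuleSet rules = new RuleSet();
        System.out.println(rules.add("/a/"));   // true:  new prefix
        System.out.println(rules.add("/a/b/")); // false: "/a/" already covers it
        System.out.println(rules.containsPrefixOf("/a/b/c.html")); // true
        System.out.println(rules.containsPrefixOf("/b/"));         // false
        System.out.println(rules.size());       // 1: only "/a/" is stored
    }
}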
@@ -1,49 +0,0 @@
package edu.uci.ics.crawler4j.url;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;

public class TLDList {

    private final static Set<String> tldSet = new HashSet<String>();

    public static boolean contains(String str) {
        // tldSet is initialized by the static block below and is never null
        return tldSet.contains(str);
    }

    static {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(
                    TLDList.class.getClassLoader().getResourceAsStream(
                            "tld-names.txt")));
            String line;

            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty() || line.startsWith("//")) {
                    continue;
                }
                tldSet.add(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    // Nothing we can do
                }
            }
        }
    }

}
@@ -1,217 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.url;

import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

/**
 * See http://en.wikipedia.org/wiki/URL_normalization for a reference.
 * Note: some parts of the code are adapted from
 * http://stackoverflow.com/a/4057470/405418
 *
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class URLCanonicalizer {

    public static String getCanonicalURL(String url) {
        return getCanonicalURL(url, null);
    }

    public static String getCanonicalURL(String href, String context) {

        try {
            URL canonicalURL = new URL(UrlResolver.resolveUrl(
                    context == null ? "" : context, href));

            String path = canonicalURL.getPath();

            /*
             * Normalize: no empty segments (i.e., "//"), no segments equal to
             * ".", and no segments equal to ".." that are preceded by a segment
             * not equal to "..".
             */
            path = new URI(path).normalize().toString();

            /*
             * Convert '//' -> '/'
             */
            int idx = path.indexOf("//");
            while (idx >= 0) {
                path = path.replace("//", "/");
                idx = path.indexOf("//");
            }

            /*
             * Drop starting '/../'
             */
            while (path.startsWith("/../")) {
                path = path.substring(3);
            }

            /*
             * Trim
             */
            path = path.trim();

            final SortedMap<String, String> params = createParameterMap(canonicalURL.getQuery());
            final String queryString;

            if (params != null && params.size() > 0) {
                String canonicalParams = canonicalize(params);
                queryString = (canonicalParams.isEmpty() ? "" : "?" + canonicalParams);
            } else {
                queryString = "";
            }

            /*
             * Add starting slash if needed
             */
            if (path.length() == 0) {
                path = "/" + path;
            }

            /*
             * Drop default port: example.com:80 -> example.com
             */
            int port = canonicalURL.getPort();
            if (port == canonicalURL.getDefaultPort()) {
                port = -1;
            }

            /*
             * Lowercasing protocol and host
             */
            String protocol = canonicalURL.getProtocol().toLowerCase();
            String host = canonicalURL.getHost().toLowerCase();
            String pathAndQueryString = normalizePath(path) + queryString;

            URL result = new URL(protocol, host, port, pathAndQueryString);
            return result.toExternalForm();

        } catch (MalformedURLException ex) {
            return null;
        } catch (URISyntaxException ex) {
            return null;
        }
    }

    /**
     * Takes a query string, separates the constituent name-value pairs, and
     * stores them in a SortedMap ordered by lexicographical order.
     *
     * @return Null if there is no query string.
     */
    private static SortedMap<String, String> createParameterMap(final String queryString) {
        if (queryString == null || queryString.isEmpty()) {
            return null;
        }

        final String[] pairs = queryString.split("&");
        final Map<String, String> params = new HashMap<String, String>(pairs.length);

        for (final String pair : pairs) {
            if (pair.length() == 0) {
                continue;
            }

            String[] tokens = pair.split("=", 2);
            switch (tokens.length) {
            case 1:
                if (pair.charAt(0) == '=') {
                    params.put("", tokens[0]);
                } else {
                    params.put(tokens[0], "");
                }
                break;
            case 2:
                params.put(tokens[0], tokens[1]);
                break;
            }
        }
        return new TreeMap<String, String>(params);
    }

    /**
     * Canonicalize the query string.
     *
     * @param sortedParamMap
     *            Parameter name-value pairs in lexicographical order.
     * @return Canonical form of query string.
     */
    private static String canonicalize(final SortedMap<String, String> sortedParamMap) {
        if (sortedParamMap == null || sortedParamMap.isEmpty()) {
            return "";
        }

        final StringBuffer sb = new StringBuffer(100);
        for (Map.Entry<String, String> pair : sortedParamMap.entrySet()) {
            final String key = pair.getKey().toLowerCase();
            if (key.equals("jsessionid") || key.equals("phpsessid")
                    || key.equals("aspsessionid")) {
                continue;
            }
            if (sb.length() > 0) {
                sb.append('&');
            }
            sb.append(percentEncodeRfc3986(pair.getKey()));
            if (!pair.getValue().isEmpty()) {
                sb.append('=');
                sb.append(percentEncodeRfc3986(pair.getValue()));
            }
        }
        return sb.toString();
    }

    /**
     * Percent-encode values according to RFC 3986. The built-in Java
     * URLEncoder does not encode according to the RFC, so we make the extra
     * replacements.
     *
     * @param string
     *            Decoded string.
     * @return Encoded string per RFC 3986.
     */
    private static String percentEncodeRfc3986(String string) {
        try {
            string = string.replace("+", "%2B");
            string = URLDecoder.decode(string, "UTF-8");
            string = URLEncoder.encode(string, "UTF-8");
            return string.replace("+", "%20").replace("*", "%2A").replace("%7E", "~");
        } catch (Exception e) {
            return string;
        }
    }

    private static String normalizePath(final String path) {
        return path.replace("%7E", "~").replace(" ", "%20");
    }
}
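A sketch of the combined effect (lowercased scheme and host, dropped default port, normalized path, lexicographically sorted query, stripped session ids); the expected outputs are derived by reading the code above, not from a test run:

import edu.uci.ics.crawler4j.url.URLCanonicalizer;

public class CanonicalizerDemo {
    public static void main(String[] args) {
        // expected: http://example.com/b/c?a=2&z=1
        System.out.println(URLCanonicalizer.getCanonicalURL(
                "HTTP://Example.COM:80/a/../b/./c?z=1&a=2"));
        // expected: http://example.com/index.html (jsessionid dropped)
        System.out.println(URLCanonicalizer.getCanonicalURL(
                "http://example.com/index.html?jsessionid=1E6"));
    }
}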
@@ -1,484 +0,0 @@
/**
 * This class is adapted from HtmlUnit with the following copyright:
 *
 * Copyright (c) 2002-2012 Gargoyle Software Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.url;

public final class UrlResolver {

    /**
     * Resolves a given relative URL against a base URL. See
     * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>
     * Section 4 for more details.
     *
     * @param baseUrl
     *            The base URL in which to resolve the specification.
     * @param relativeUrl
     *            The relative URL to resolve against the base URL.
     * @return the resolved specification.
     */
    public static String resolveUrl(final String baseUrl,
            final String relativeUrl) {
        if (baseUrl == null) {
            throw new IllegalArgumentException("Base URL must not be null");
        }
        if (relativeUrl == null) {
            throw new IllegalArgumentException("Relative URL must not be null");
        }
        final Url url = resolveUrl(parseUrl(baseUrl.trim()), relativeUrl.trim());

        return url.toString();
    }

    /**
     * Returns the index within the specified string of the first occurrence of
     * the specified search character.
     *
     * @param s
     *            the string to search
     * @param searchChar
     *            the character to search for
     * @param beginIndex
     *            the index at which to start the search
     * @param endIndex
     *            the index at which to stop the search
     * @return the index of the first occurrence of the character in the string
     *         or <tt>-1</tt>
     */
    private static int indexOf(final String s, final char searchChar,
            final int beginIndex, final int endIndex) {
        for (int i = beginIndex; i < endIndex; i++) {
            if (s.charAt(i) == searchChar) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Parses a given specification using the algorithm depicted in
     * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
     *
     * Section 2.4: Parsing a URL
     *
     * An accepted method for parsing URLs is useful to clarify the generic-RL
     * syntax of Section 2.2 and to describe the algorithm for resolving
     * relative URLs presented in Section 4. This section describes the parsing
     * rules for breaking down a URL (relative or absolute) into the component
     * parts described in Section 2.1. The rules assume that the URL has already
     * been separated from any surrounding text and copied to a "parse string".
     * The rules are listed in the order in which they would be applied by the
     * parser.
     *
     * @param spec
     *            The specification to parse.
     * @return the parsed specification.
     */
    private static Url parseUrl(final String spec) {
        final Url url = new Url();
        int startIndex = 0;
        int endIndex = spec.length();

        // Section 2.4.1: Parsing the Fragment Identifier
        //
        // If the parse string contains a crosshatch "#" character, then the
        // substring after the first (left-most) crosshatch "#" and up to the
        // end of the parse string is the <fragment> identifier. If the
        // crosshatch is the last character, or no crosshatch is present, then
        // the fragment identifier is empty. The matched substring, including
        // the crosshatch character, is removed from the parse string before
        // continuing.
        //
        // Note that the fragment identifier is not considered part of the URL.
        // However, since it is often attached to the URL, parsers must be able
        // to recognize and set aside fragment identifiers as part of the
        // process.
        final int crosshatchIndex = indexOf(spec, '#', startIndex, endIndex);

        if (crosshatchIndex >= 0) {
            url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex);
            endIndex = crosshatchIndex;
        }
        // Section 2.4.2: Parsing the Scheme
        //
        // If the parse string contains a colon ":" after the first character
        // and before any characters not allowed as part of a scheme name (i.e.,
        // any not an alphanumeric, plus "+", period ".", or hyphen "-"), the
        // <scheme> of the URL is the substring of characters up to but not
        // including the first colon. These characters and the colon are then
        // removed from the parse string before continuing.
        final int colonIndex = indexOf(spec, ':', startIndex, endIndex);

        if (colonIndex > 0) {
            final String scheme = spec.substring(startIndex, colonIndex);
            if (isValidScheme(scheme)) {
                url.scheme_ = scheme;
                startIndex = colonIndex + 1;
            }
        }
        // Section 2.4.3: Parsing the Network Location/Login
        //
        // If the parse string begins with a double-slash "//", then the
        // substring of characters after the double-slash and up to, but not
        // including, the next slash "/" character is the network location/login
        // (<net_loc>) of the URL. If no trailing slash "/" is present, the
        // entire remaining parse string is assigned to <net_loc>. The double-
        // slash and <net_loc> are removed from the parse string before
        // continuing.
        //
        // Note: We also accept a question mark "?" or a semicolon ";" character
        // as delimiters for the network location/login (<net_loc>) of the URL.
        final int locationStartIndex;
        int locationEndIndex;

        if (spec.startsWith("//", startIndex)) {
            locationStartIndex = startIndex + 2;
            locationEndIndex = indexOf(spec, '/', locationStartIndex, endIndex);
            if (locationEndIndex >= 0) {
                startIndex = locationEndIndex;
            }
        } else {
            locationStartIndex = -1;
            locationEndIndex = -1;
        }
        // Section 2.4.4: Parsing the Query Information
        //
        // If the parse string contains a question mark "?" character, then the
        // substring after the first (left-most) question mark "?" and up to the
        // end of the parse string is the <query> information. If the question
        // mark is the last character, or no question mark is present, then the
        // query information is empty. The matched substring, including the
        // question mark character, is removed from the parse string before
        // continuing.
        final int questionMarkIndex = indexOf(spec, '?', startIndex, endIndex);

        if (questionMarkIndex >= 0) {
            if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
                // The substring of characters after the double-slash and up to,
                // but not including, the question mark "?" character is the
                // network location/login (<net_loc>) of the URL.
                locationEndIndex = questionMarkIndex;
                startIndex = questionMarkIndex;
            }
            url.query_ = spec.substring(questionMarkIndex + 1, endIndex);
            endIndex = questionMarkIndex;
        }
        // Section 2.4.5: Parsing the Parameters
        //
        // If the parse string contains a semicolon ";" character, then the
        // substring after the first (left-most) semicolon ";" and up to the end
        // of the parse string is the parameters (<params>). If the semicolon
        // is the last character, or no semicolon is present, then <params> is
        // empty. The matched substring, including the semicolon character, is
        // removed from the parse string before continuing.
        final int semicolonIndex = indexOf(spec, ';', startIndex, endIndex);

        if (semicolonIndex >= 0) {
            if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
                // The substring of characters after the double-slash and up to,
                // but not including, the semicolon ";" character is the network
                // location/login (<net_loc>) of the URL.
                locationEndIndex = semicolonIndex;
                startIndex = semicolonIndex;
            }
            url.parameters_ = spec.substring(semicolonIndex + 1, endIndex);
            endIndex = semicolonIndex;
        }
        // Section 2.4.6: Parsing the Path
        //
        // After the above steps, all that is left of the parse string is the
        // URL <path> and the slash "/" that may precede it. Even though the
        // initial slash is not part of the URL path, the parser must remember
        // whether or not it was present so that later processes can
        // differentiate between relative and absolute paths. Often this is
        // done by simply storing the preceding slash along with the path.
        if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
            // The entire remaining parse string is assigned to the network
            // location/login (<net_loc>) of the URL.
            locationEndIndex = endIndex;
        } else if (startIndex < endIndex) {
            url.path_ = spec.substring(startIndex, endIndex);
        }
        // Set the network location/login (<net_loc>) of the URL.
        if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) {
            url.location_ = spec.substring(locationStartIndex, locationEndIndex);
        }
        return url;
    }

    /*
     * Returns true if specified string is a valid scheme name.
     */
    private static boolean isValidScheme(final String scheme) {
        final int length = scheme.length();
        if (length < 1) {
            return false;
        }
        char c = scheme.charAt(0);
        if (!Character.isLetter(c)) {
            return false;
        }
        for (int i = 1; i < length; i++) {
            c = scheme.charAt(i);
            if (!Character.isLetterOrDigit(c) && c != '.' && c != '+'
                    && c != '-') {
                return false;
            }
        }
        return true;
    }

    /**
     * Resolves a given relative URL against a base URL using the algorithm
     * depicted in <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
     *
     * Section 4: Resolving Relative URLs
     *
     * This section describes an example algorithm for resolving URLs within a
     * context in which the URLs may be relative, such that the result is always
     * a URL in absolute form. Although this algorithm cannot guarantee that the
     * resulting URL will equal that intended by the original author, it does
     * guarantee that any valid URL (relative or absolute) can be consistently
     * transformed to an absolute form given a valid base URL.
     *
     * @param baseUrl
     *            The base URL in which to resolve the specification.
     * @param relativeUrl
     *            The relative URL to resolve against the base URL.
     * @return the resolved specification.
     */
    private static Url resolveUrl(final Url baseUrl, final String relativeUrl) {
        final Url url = parseUrl(relativeUrl);
        // Step 1: The base URL is established according to the rules of
        //         Section 3. If the base URL is the empty string (unknown),
        //         the embedded URL is interpreted as an absolute URL and
        //         we are done.
        if (baseUrl == null) {
            return url;
        }
        // Step 2: Both the base and embedded URLs are parsed into their
        //         component parts as described in Section 2.4.
        //     a) If the embedded URL is entirely empty, it inherits the
        //        entire base URL (i.e., is set equal to the base URL)
        //        and we are done.
        if (relativeUrl.length() == 0) {
            return new Url(baseUrl);
        }
        //     b) If the embedded URL starts with a scheme name, it is
        //        interpreted as an absolute URL and we are done.
        if (url.scheme_ != null) {
            return url;
        }
        //     c) Otherwise, the embedded URL inherits the scheme of
        //        the base URL.
        url.scheme_ = baseUrl.scheme_;
        // Step 3: If the embedded URL's <net_loc> is non-empty, we skip to
        //         Step 7. Otherwise, the embedded URL inherits the <net_loc>
        //         (if any) of the base URL.
        if (url.location_ != null) {
            return url;
        }
        url.location_ = baseUrl.location_;
        // Step 4: If the embedded URL path is preceded by a slash "/", the
        //         path is not relative and we skip to Step 7.
        if ((url.path_ != null)
                && ((url.path_.length() > 0) && ('/' == url.path_.charAt(0)))) {
            url.path_ = removeLeadingSlashPoints(url.path_);
            return url;
        }
        // Step 5: If the embedded URL path is empty (and not preceded by a
        //         slash), then the embedded URL inherits the base URL path,
        //         and
        if (url.path_ == null) {
            url.path_ = baseUrl.path_;
            //     a) if the embedded URL's <params> is non-empty, we skip to
            //        step 7; otherwise, it inherits the <params> of the base
            //        URL (if any) and
            if (url.parameters_ != null) {
                return url;
            }
            url.parameters_ = baseUrl.parameters_;
            //     b) if the embedded URL's <query> is non-empty, we skip to
            //        step 7; otherwise, it inherits the <query> of the base
            //        URL (if any) and we skip to step 7.
            if (url.query_ != null) {
                return url;
            }
            url.query_ = baseUrl.query_;
            return url;
        }
        // Step 6: The last segment of the base URL's path (anything
        //         following the rightmost slash "/", or the entire path if no
        //         slash is present) is removed and the embedded URL's path is
        //         appended in its place. The following operations are
        //         then applied, in order, to the new path:
        final String basePath = baseUrl.path_;
        String path = "";

        if (basePath != null) {
            final int lastSlashIndex = basePath.lastIndexOf('/');

            if (lastSlashIndex >= 0) {
                path = basePath.substring(0, lastSlashIndex + 1);
            }
        } else {
            path = "/";
        }
        path = path.concat(url.path_);
        //     a) All occurrences of "./", where "." is a complete path
        //        segment, are removed.
        int pathSegmentIndex;

        while ((pathSegmentIndex = path.indexOf("/./")) >= 0) {
            path = path.substring(0, pathSegmentIndex + 1).concat(
                    path.substring(pathSegmentIndex + 3));
        }
        //     b) If the path ends with "." as a complete path segment,
        //        that "." is removed.
        if (path.endsWith("/.")) {
            path = path.substring(0, path.length() - 1);
        }
        //     c) All occurrences of "<segment>/../", where <segment> is a
        //        complete path segment not equal to "..", are removed.
        //        Removal of these path segments is performed iteratively,
        //        removing the leftmost matching pattern on each iteration,
        //        until no matching pattern remains.
        while ((pathSegmentIndex = path.indexOf("/../")) > 0) {
            final String pathSegment = path.substring(0, pathSegmentIndex);
            final int slashIndex = pathSegment.lastIndexOf('/');

            if (slashIndex < 0) {
                // No complete preceding segment to remove; stop rather than
                // loop forever on the same index.
                break;
            }
            if (!"..".equals(pathSegment.substring(slashIndex))) {
                path = path.substring(0, slashIndex + 1).concat(
                        path.substring(pathSegmentIndex + 4));
            }
        }
        //     d) If the path ends with "<segment>/..", where <segment> is a
        //        complete path segment not equal to "..", that
        //        "<segment>/.." is removed.
        if (path.endsWith("/..")) {
            final String pathSegment = path.substring(0, path.length() - 3);
            final int slashIndex = pathSegment.lastIndexOf('/');

            if (slashIndex >= 0) {
                path = path.substring(0, slashIndex + 1);
            }
        }

        path = removeLeadingSlashPoints(path);

        url.path_ = path;
        // Step 7: The resulting URL components, including any inherited from
        //         the base URL, are recombined to give the absolute form of
        //         the embedded URL.
        return url;
    }

    /**
     * "/.." at the beginning should be removed as browsers do (not in RFC)
     */
    private static String removeLeadingSlashPoints(String path) {
        while (path.startsWith("/..")) {
            path = path.substring(3);
        }

        return path;
    }

    /**
     * Class <tt>Url</tt> represents a Uniform Resource Locator.
     *
     * @author Martin Tamme
     */
    private static class Url {

        private String scheme_;

        private String location_;

        private String path_;

        private String parameters_;

        private String query_;

        private String fragment_;

        /**
         * Creates a <tt>Url</tt> object.
         */
        public Url() {
        }

        /**
         * Creates a <tt>Url</tt> object from the specified <tt>Url</tt> object.
         *
         * @param url
         *            a <tt>Url</tt> object.
         */
        public Url(final Url url) {
            scheme_ = url.scheme_;
            location_ = url.location_;
            path_ = url.path_;
            parameters_ = url.parameters_;
            query_ = url.query_;
            fragment_ = url.fragment_;
        }

        /**
         * Returns a string representation of the <tt>Url</tt> object.
         *
         * @return a string representation of the <tt>Url</tt> object.
         */
        @Override
        public String toString() {
            final StringBuilder sb = new StringBuilder();

            if (scheme_ != null) {
                sb.append(scheme_);
                sb.append(':');
            }
            if (location_ != null) {
                sb.append("//");
                sb.append(location_);
            }
            if (path_ != null) {
                sb.append(path_);
            }
            if (parameters_ != null) {
                sb.append(';');
                sb.append(parameters_);
            }
            if (query_ != null) {
                sb.append('?');
                sb.append(query_);
            }
            if (fragment_ != null) {
                sb.append('#');
                sb.append(fragment_);
            }
            return sb.toString();
        }
    }
}
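A sketch using the classic RFC 1808 example base:

import edu.uci.ics.crawler4j.url.UrlResolver;

public class UrlResolverDemo {
    public static void main(String[] args) {
        String base = "http://a/b/c/d;p?q#f";
        System.out.println(UrlResolver.resolveUrl(base, "../g"));  // http://a/b/g
        System.out.println(UrlResolver.resolveUrl(base, "g?y"));   // http://a/b/c/g?y
        System.out.println(UrlResolver.resolveUrl(base, "//h/i")); // http://h/i
    }
}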
@@ -1,177 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.url;

import java.io.Serializable;

import com.sleepycat.persist.model.Entity;
import com.sleepycat.persist.model.PrimaryKey;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
@Entity
public class WebURL implements Serializable {

    private static final long serialVersionUID = 1L;

    @PrimaryKey
    private String url;

    private int docid;

    private int parentDocid;

    private String parentUrl;

    private short depth;

    private String domain;

    private String subDomain;

    private String path;

    /**
     * Returns the unique document id assigned to this Url.
     */
    public int getDocid() {
        return docid;
    }

    public void setDocid(int docid) {
        this.docid = docid;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }

        WebURL otherUrl = (WebURL) o;
        return url != null && url.equals(otherUrl.getURL());
    }

    @Override
    public int hashCode() {
        // equals() above compares by url, so hashCode() must be consistent with it
        return url != null ? url.hashCode() : 0;
    }

    @Override
    public String toString() {
        return url;
    }

    /**
     * Returns the Url string
     */
    public String getURL() {
        return url;
    }

    public void setURL(String url) {
        this.url = url;

        int domainStartIdx = url.indexOf("//") + 2;
        int domainEndIdx = url.indexOf('/', domainStartIdx);
        // Guard against URLs without a path component (e.g. "http://example.com")
        if (domainEndIdx < 0) {
            domainEndIdx = url.length();
        }
        domain = url.substring(domainStartIdx, domainEndIdx);
        subDomain = "";
        String[] parts = domain.split("\\.");
        if (parts.length > 2) {
            domain = parts[parts.length - 2] + "." + parts[parts.length - 1];
            int limit = 2;
            if (TLDList.contains(domain)) {
                domain = parts[parts.length - 3] + "." + domain;
                limit = 3;
            }
            for (int i = 0; i < parts.length - limit; i++) {
                if (subDomain.length() > 0) {
                    subDomain += ".";
                }
                subDomain += parts[i];
            }
        }
        path = url.substring(domainEndIdx);
        int pathEndIdx = path.indexOf('?');
        if (pathEndIdx >= 0) {
            path = path.substring(0, pathEndIdx);
        }
    }

    /**
     * Returns the unique document id of the parent page. The parent page is the
     * page in which the Url of this page is first observed.
     */
    public int getParentDocid() {
        return parentDocid;
    }

    public void setParentDocid(int parentDocid) {
        this.parentDocid = parentDocid;
    }

    /**
     * Returns the url of the parent page. The parent page is the page in which
     * the Url of this page is first observed.
     */
    public String getParentUrl() {
        return parentUrl;
    }

    public void setParentUrl(String parentUrl) {
        this.parentUrl = parentUrl;
    }

    /**
     * Returns the crawl depth at which this Url is first observed. Seed Urls
     * are at depth 0. Urls that are extracted from seed Urls are at depth 1,
     * etc.
     */
    public short getDepth() {
        return depth;
    }

    public void setDepth(short depth) {
        this.depth = depth;
    }

    /**
     * Returns the domain of this Url. For 'http://www.example.com/sample.htm',
     * domain will be 'example.com'
     */
    public String getDomain() {
        return domain;
    }

    public String getSubDomain() {
        return subDomain;
    }

    /**
     * Returns the path of this Url. For 'http://www.example.com/sample.htm',
     * path will be '/sample.htm'
     */
    public String getPath() {
        return path;
    }

    public void setPath(String path) {
        this.path = path;
    }
}
|
|
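A short usage sketch for WebURL (assumes TLDList from the same package and its suffix data are available; the URL is illustrative):

    WebURL webUrl = new WebURL();
    webUrl.setURL("http://maps.metro.example.com/tiles/index.html?z=3");
    webUrl.getDomain();     // "example.com"
    webUrl.getSubDomain();  // "maps.metro"
    webUrl.getPath();       // "/tiles/index.html" (query string stripped)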
@ -1,66 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.util;

import java.io.File;
import java.io.FileOutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;

import org.apache.log4j.Logger;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class IO {

    private static final Logger logger = Logger.getLogger(IO.class.getName());

    public static boolean deleteFolder(File folder) {
        return deleteFolderContents(folder) && folder.delete();
    }

    public static boolean deleteFolderContents(File folder) {
        if (logger.isDebugEnabled()) {
            logger.debug("Deleting content of: " + folder.getAbsolutePath());
        }
        File[] files = folder.listFiles();
        if (files == null) {
            // listFiles() returns null when the path does not exist or is
            // not a directory; treat that as a failed deletion.
            return false;
        }
        for (File file : files) {
            if (file.isFile()) {
                if (!file.delete()) {
                    return false;
                }
            } else {
                if (!deleteFolder(file)) {
                    return false;
                }
            }
        }
        return true;
    }

    public static void writeBytesToFile(byte[] bytes, String destination) {
        FileChannel fc = null;
        try {
            fc = new FileOutputStream(destination).getChannel();
            fc.write(ByteBuffer.wrap(bytes));
        } catch (Exception e) {
            logger.error("Could not write bytes to " + destination, e);
        } finally {
            // Close in finally so the channel is released even if the write fails.
            if (fc != null) {
                try {
                    fc.close();
                } catch (Exception ignored) {
                    // Nothing useful to do if close itself fails.
                }
            }
        }
    }
}
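A small usage sketch for the IO helpers above (paths and payload are illustrative):

    byte[] payload = "hello".getBytes();
    IO.writeBytesToFile(payload, "/tmp/crawl/data.bin");
    // Tear down the working directory afterwards; deleteFolder() reports
    // false if any entry (or the folder itself) could not be removed.
    boolean cleaned = IO.deleteFolder(new java.io.File("/tmp/crawl"));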
@ -1,82 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.util;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class Util {

    public static byte[] long2ByteArray(long l) {
        byte[] array = new byte[8];
        int i, shift;
        for (i = 0, shift = 56; i < 8; i++, shift -= 8) {
            array[i] = (byte) (0xFF & (l >> shift));
        }
        return array;
    }

    public static byte[] int2ByteArray(int value) {
        byte[] b = new byte[4];
        for (int i = 0; i < 4; i++) {
            int offset = (b.length - 1 - i) * 8;
            b[i] = (byte) ((value >>> offset) & 0xFF);
        }
        return b;
    }

    public static int byteArray2Int(byte[] b) {
        int value = 0;
        for (int i = 0; i < 4; i++) {
            int shift = (4 - 1 - i) * 8;
            value += (b[i] & 0x000000FF) << shift;
        }
        return value;
    }

    public static long byteArray2Long(byte[] b) {
        // Accumulate in a long and widen each byte before shifting; an int
        // accumulator would silently truncate the high four bytes.
        long value = 0;
        for (int i = 0; i < 8; i++) {
            int shift = (8 - 1 - i) * 8;
            value += (long) (b[i] & 0x000000FF) << shift;
        }
        return value;
    }

    public static boolean hasBinaryContent(String contentType) {
        if (contentType != null) {
            String typeStr = contentType.toLowerCase();
            if (typeStr.contains("image") || typeStr.contains("audio")
                    || typeStr.contains("video")
                    || typeStr.contains("application")) {
                return true;
            }
        }
        return false;
    }

    public static boolean hasPlainTextContent(String contentType) {
        if (contentType != null) {
            String typeStr = contentType.toLowerCase();
            if (typeStr.contains("text/plain")) {
                return true;
            }
        }
        return false;
    }
}
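A quick round-trip check of the conversion helpers above (values are illustrative; the long case relies on the widened accumulator in byteArray2Long):

    byte[] four = Util.int2ByteArray(0x12345678);
    int i = Util.byteArray2Int(four);                  // 0x12345678 again
    byte[] eight = Util.long2ByteArray(0x123456789ABCDEF0L);
    long l = Util.byteArray2Long(eight);               // 0x123456789ABCDEF0L again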
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
    <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
    <classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
    <classpathentry exported="true" kind="lib" path="imageio-ext-tiff-1.0.8.jar"/>
    <classpathentry exported="true" kind="lib" path="imageio-ext-utilities-1.0.8.jar"/>
    <classpathentry kind="output" path="bin"/>
</classpath>
@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>it.geosolutions</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.eclipse.jdt.core.javabuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.ManifestBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.SchemaBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.eclipse.pde.PluginNature</nature>
        <nature>org.eclipse.jdt.core.javanature</nature>
    </natures>
</projectDescription>
@ -1,7 +0,0 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6
@ -1,12 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: Geosolutions
Bundle-SymbolicName: it.geosolutions
Bundle-Version: 1.0.8
Bundle-ClassPath: imageio-ext-tiff-1.0.8.jar,
 imageio-ext-utilities-1.0.8.jar
Export-Package: it.geosolutions.imageio.plugins.tiff,
 it.geosolutions.imageio.utilities,
 it.geosolutions.imageioimpl.plugins.tiff
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
Eclipse-BuddyPolicy: registered, ext, global
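On the manifest above: Eclipse-BuddyPolicy widens this bundle's class loading so that classes contributed by other bundles (or on the extension/application classpath) can be found at run time, which is typically needed when a wrapped jar calls Class.forName() on classes it cannot otherwise see. A bundle wanting to expose its classes to it.geosolutions would declare itself a buddy; an illustrative manifest line:

    Eclipse-RegisterBuddy: it.geosolutions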
@ -1,3 +0,0 @@
bin.includes = META-INF/,\
imageio-ext-tiff-1.0.8.jar,\
imageio-ext-utilities-1.0.8.jar
Binary file not shown.
Binary file not shown.
@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
    <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
    <classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
    <classpathentry exported="true" kind="lib" path="activation.jar" sourcepath="javax.activationsrc.zip"/>
    <classpathentry kind="output" path="bin"/>
</classpath>
@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>javax.activation</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.eclipse.jdt.core.javabuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.ManifestBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.SchemaBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.eclipse.pde.PluginNature</nature>
        <nature>org.eclipse.jdt.core.javanature</nature>
    </natures>
</projectDescription>
@ -1,7 +0,0 @@
#Thu Mar 26 11:17:44 CDT 2009
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6
@ -1,10 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: Activation Plug-in
Bundle-SymbolicName: javax.activation
Bundle-Version: 1.0.0.qualifier
Bundle-ClassPath: activation.jar
Export-Package: com.sun.activation.registries,
 com.sun.activation.viewers,
 javax.activation
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
Binary file not shown.
@ -1,2 +0,0 @@
bin.includes = META-INF/,\
activation.jar
@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
    <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
    <classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
    <classpathentry exported="true" kind="lib" path="mail.jar" sourcepath="javax.mailsrc.zip"/>
    <classpathentry kind="output" path="bin"/>
</classpath>
@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>javax.mail</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.eclipse.jdt.core.javabuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.ManifestBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.SchemaBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.eclipse.pde.PluginNature</nature>
        <nature>org.eclipse.jdt.core.javanature</nature>
    </natures>
</projectDescription>
@ -1,7 +0,0 @@
#Thu Mar 26 11:18:00 CDT 2009
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6
@ -1,19 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: Mail Plug-in
Bundle-SymbolicName: javax.mail
Bundle-Version: 1.0.0.qualifier
Bundle-ClassPath: mail.jar,
 .
Export-Package: com.sun.mail.handlers,
 com.sun.mail.iap,
 com.sun.mail.imap,
 com.sun.mail.imap.protocol,
 com.sun.mail.pop3,
 com.sun.mail.smtp,
 com.sun.mail.util,
 javax.mail,
 javax.mail.event,
 javax.mail.internet,
 javax.mail.search
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
@ -1,3 +0,0 @@
bin.includes = META-INF/,\
.,\
mail.jar
Binary file not shown.
@ -1,22 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>javax.media.opengl.win64</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.eclipse.pde.ManifestBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.pde.SchemaBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.eclipse.pde.PluginNature</nature>
    </natures>
</projectDescription>
@ -1,8 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: JOGL Win64 Specific Fragment
Bundle-SymbolicName: javax.media.opengl.win64
Bundle-Version: 1.14.0.qualifier
Fragment-Host: javax.media.opengl;bundle-version="1.1.1"
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
Eclipse-PlatformFilter: (& (osgi.os=win32) (osgi.arch=x86_64))
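For context: Eclipse-PlatformFilter makes Equinox resolve this fragment only on 64-bit Windows, so the DLLs listed below are loaded solely there while the host bundle stays platform-neutral. A sibling fragment for 64-bit Linux would follow the same pattern (hypothetical manifest excerpt; the bundle name is assumed):

    Bundle-SymbolicName: javax.media.opengl.linux64
    Fragment-Host: javax.media.opengl;bundle-version="1.1.1"
    Eclipse-PlatformFilter: (& (osgi.os=linux) (osgi.arch=x86_64))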
@ -1,5 +0,0 @@
bin.includes = META-INF/,\
gluegen-rt.dll,\
jogl.dll,\
jogl_awt.dll,\
jogl_cg.dll
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
    <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
    <classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
    <classpathentry kind="output" path="bin"/>
</classpath>
Some files were not shown because too many files have changed in this diff.