Omaha #3210 - moving FOSS from cots/ into the AWIPS2_foss repository

Former-commit-id: 9396273e49 [formerly 24de47720f [formerly 762b2ad4da3ab2944b0df17ce655bcc60d02bdc3]]
Former-commit-id: 24de47720f
Former-commit-id: 119df754b6
Steve Harris 2014-05-29 12:00:50 -05:00
parent 7c364fa77c
commit be5b5215f5
589 changed files with 0 additions and 65548 deletions


@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
<classpathentry exported="true" kind="lib" path="jna-4.1.0.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>


@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>com.sun.jna</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.ManifestBuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.SchemaBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.pde.PluginNature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>


@@ -1,7 +0,0 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6


@@ -1,10 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: JNA FOSS
Bundle-SymbolicName: com.sun.jna
Bundle-Version: 4.1.0
Bundle-ClassPath: jna-4.1.0.jar
Export-Package: com.sun.jna,
com.sun.jna.ptr,
com.sun.jna.win32
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
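
For orientation only (not part of this commit): a minimal sketch of how a plugin that depends on this bundle could call native code through the exported com.sun.jna packages. The CLibrary interface and the choice of libc are illustrative assumptions.

import com.sun.jna.Library;
import com.sun.jna.Native;

public class JnaSketch {

    // Illustrative direct mapping of the C runtime; the library name is an assumption.
    public interface CLibrary extends Library {
        CLibrary INSTANCE = (CLibrary) Native.loadLibrary("c", CLibrary.class);

        int printf(String format, Object... args);
    }

    public static void main(String[] args) {
        // Calls into the native library through the JNA proxy created above.
        CLibrary.INSTANCE.printf("hello from JNA %s\n", "4.1.0");
    }
}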


@@ -1,2 +0,0 @@
bin.includes = META-INF/,\
jna-4.1.0.jar

Binary file not shown.


@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry exported="true" kind="lib" path="jaxb-impl-2.1.9.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
<classpathentry kind="output" path="bin"/>
</classpath>


@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>com.sun.xml.bind</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.ManifestBuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.SchemaBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.pde.PluginNature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>


@@ -1,8 +0,0 @@
#Fri Jun 08 12:02:53 CDT 2012
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6


@@ -1,37 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: Bind
Bundle-SymbolicName: com.sun.xml.bind
Bundle-Version: 1.0.0.qualifier
Bundle-ClassPath: jaxb-impl-2.1.9.jar
Bundle-Vendor: SUN
Export-Package: com.sun.istack,
com.sun.istack.localization,
com.sun.xml.bind,
com.sun.xml.bind.annotation,
com.sun.xml.bind.api,
com.sun.xml.bind.api.impl,
com.sun.xml.bind.marshaller,
com.sun.xml.bind.unmarshaller,
com.sun.xml.bind.util,
com.sun.xml.bind.v2,
com.sun.xml.bind.v2.bytecode,
com.sun.xml.bind.v2.model.annotation,
com.sun.xml.bind.v2.model.core,
com.sun.xml.bind.v2.model.impl,
com.sun.xml.bind.v2.model.nav,
com.sun.xml.bind.v2.model.runtime,
com.sun.xml.bind.v2.runtime,
com.sun.xml.bind.v2.runtime.output,
com.sun.xml.bind.v2.runtime.property,
com.sun.xml.bind.v2.runtime.reflect,
com.sun.xml.bind.v2.runtime.reflect.opt,
com.sun.xml.bind.v2.runtime.unmarshaller,
com.sun.xml.bind.v2.schemagen,
com.sun.xml.bind.v2.schemagen.episode,
com.sun.xml.bind.v2.schemagen.xmlschema,
com.sun.xml.bind.v2.util,
com.sun.xml.txw2,
com.sun.xml.txw2.annotation,
com.sun.xml.txw2.output
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
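
For orientation only (not part of this commit): a small sketch of the standard javax.xml.bind API that this bundle's jaxb-impl jar backs. The Greeting class is a hypothetical payload.

import javax.xml.bind.JAXBContext;
import javax.xml.bind.Marshaller;
import javax.xml.bind.annotation.XmlRootElement;

public class JaxbSketch {

    // Hypothetical payload type, purely for illustration.
    @XmlRootElement
    public static class Greeting {
        public String text = "hello";
    }

    public static void main(String[] args) throws Exception {
        JAXBContext context = JAXBContext.newInstance(Greeting.class);
        Marshaller marshaller = context.createMarshaller();
        marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE);
        // Marshals the public field to XML on standard out.
        marshaller.marshal(new Greeting(), System.out);
    }
}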


@@ -1,3 +0,0 @@
bin.includes = META-INF/,\
.,\
jaxb-impl-2.1.9.jar


@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry exported="true" kind="lib" path="JavaAPIforKml.jar" sourcepath="JavaAPIforKml-sources.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
<classpathentry kind="output" path="bin"/>
</classpath>


@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>de.micromata.opengis.kml</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.ManifestBuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.SchemaBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.pde.PluginNature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>


@@ -1,8 +0,0 @@
#Wed May 30 18:56:22 CDT 2012
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6


@@ -1,13 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: Kml
Bundle-SymbolicName: de.micromata.opengis.kml
Bundle-Version: 1.0.0.qualifier
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
Bundle-ClassPath: JavaAPIforKml.jar,
.
Export-Package: de.micromata.opengis.kml.v_2_2_0,
de.micromata.opengis.kml.v_2_2_0.annotations,
de.micromata.opengis.kml.v_2_2_0.atom,
de.micromata.opengis.kml.v_2_2_0.gx,
de.micromata.opengis.kml.v_2_2_0.xal
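
For orientation only (not part of this commit): a hedged sketch of the fluent Java API for KML that this bundle wraps; the placemark name, coordinates, and output file are placeholders.

import java.io.File;

import de.micromata.opengis.kml.v_2_2_0.Kml;

public class KmlSketch {
    public static void main(String[] args) throws Exception {
        Kml kml = new Kml();
        // Fluent builders: a single placemark containing a point geometry.
        kml.createAndSetPlacemark()
           .withName("Example placemark")
           .createAndSetPoint().addToCoordinates(-97.0, 41.0);   // longitude, latitude placeholders
        kml.marshal(new File("example.kml"));                    // writes KML 2.2 XML to disk
    }
}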


@@ -1,3 +0,0 @@
bin.includes = META-INF/,\
.,\
JavaAPIforKml.jar


@@ -1,17 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry exported="true" kind="lib" path="apache-mime4j-core-0.7.jar"/>
<classpathentry exported="true" kind="lib" path="apache-mime4j-dom-0.7.jar"/>
<classpathentry exported="true" kind="lib" path="asm-3.1.jar"/>
<classpathentry exported="true" kind="lib" path="boilerpipe-1.1.0.jar"/>
<classpathentry exported="true" kind="lib" path="je-4.0.92.jar"/>
<classpathentry exported="true" kind="lib" path="metadata-extractor-2.4.0-beta-1.jar"/>
<classpathentry exported="true" kind="lib" path="tagsoup-1.2.1.jar"/>
<classpathentry exported="true" kind="lib" path="tika-core-1.0.jar"/>
<classpathentry exported="true" kind="lib" path="tika-parsers-1.0.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
<classpathentry kind="src" path="src"/>
<classpathentry kind="src" path="resources"/>
<classpathentry kind="output" path="bin"/>
</classpath>


@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>edu.uci.ics.crawler4j</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.ManifestBuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.SchemaBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.pde.PluginNature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>


@@ -1,8 +0,0 @@
#Mon Feb 20 17:18:28 CST 2012
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6


@@ -1,162 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: Crawler4j
Bundle-SymbolicName: edu.uci.ics.crawler4j
Bundle-Version: 1.0.0.qualifier
Bundle-ActivationPolicy: lazy
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
Bundle-ClassPath: apache-mime4j-core-0.7.jar,
apache-mime4j-dom-0.7.jar,
asm-3.1.jar,
boilerpipe-1.1.0.jar,
je-4.0.92.jar,
metadata-extractor-2.4.0-beta-1.jar,
tagsoup-1.2.1.jar,
tika-core-1.0.jar,
tika-parsers-1.0.jar,
.
Require-Bundle: org.apache.commons.codec;bundle-version="1.4.0",
org.apache.http;bundle-version="4.1.2",
org.apache.commons.compress;bundle-version="1.5.0"
Export-Package: com.drew.imaging,
com.drew.imaging.jpeg,
com.drew.imaging.tiff,
com.drew.lang,
com.drew.metadata,
com.drew.metadata.exif,
com.drew.metadata.iptc,
com.drew.metadata.jpeg,
com.sleepycat.asm,
com.sleepycat.bind,
com.sleepycat.bind.serial,
com.sleepycat.bind.tuple,
com.sleepycat.collections,
com.sleepycat.compat,
com.sleepycat.je,
com.sleepycat.je.cleaner,
com.sleepycat.je.config,
com.sleepycat.je.dbi,
com.sleepycat.je.evictor,
com.sleepycat.je.incomp,
com.sleepycat.je.jmx,
com.sleepycat.je.jmx.plugin,
com.sleepycat.je.latch,
com.sleepycat.je.log,
com.sleepycat.je.log.entry,
com.sleepycat.je.recovery,
com.sleepycat.je.rep,
com.sleepycat.je.rep.elections,
com.sleepycat.je.rep.impl,
com.sleepycat.je.rep.impl.networkRestore,
com.sleepycat.je.rep.impl.node,
com.sleepycat.je.rep.jmx,
com.sleepycat.je.rep.jmx.plugin,
com.sleepycat.je.rep.monitor,
com.sleepycat.je.rep.stream,
com.sleepycat.je.rep.txn,
com.sleepycat.je.rep.util,
com.sleepycat.je.rep.util.ldiff,
com.sleepycat.je.rep.utilint,
com.sleepycat.je.rep.vlsn,
com.sleepycat.je.tree,
com.sleepycat.je.txn,
com.sleepycat.je.util,
com.sleepycat.je.utilint,
com.sleepycat.persist,
com.sleepycat.persist.evolve,
com.sleepycat.persist.impl,
com.sleepycat.persist.model,
com.sleepycat.persist.raw,
com.sleepycat.util,
com.sleepycat.util.keyrange,
de.l3s.boilerpipe,
de.l3s.boilerpipe.conditions,
de.l3s.boilerpipe.document,
de.l3s.boilerpipe.estimators,
de.l3s.boilerpipe.extractors,
de.l3s.boilerpipe.filters.english,
de.l3s.boilerpipe.filters.heuristics,
de.l3s.boilerpipe.filters.simple,
de.l3s.boilerpipe.labels,
de.l3s.boilerpipe.sax,
de.l3s.boilerpipe.util,
edu.uci.ics.crawler4j.crawler,
edu.uci.ics.crawler4j.fetcher,
edu.uci.ics.crawler4j.frontier,
edu.uci.ics.crawler4j.parser,
edu.uci.ics.crawler4j.robotstxt,
edu.uci.ics.crawler4j.url,
edu.uci.ics.crawler4j.util,
org.apache.james.mime4j,
org.apache.james.mime4j.codec,
org.apache.james.mime4j.dom,
org.apache.james.mime4j.dom.address,
org.apache.james.mime4j.dom.datetime,
org.apache.james.mime4j.dom.field,
org.apache.james.mime4j.field,
org.apache.james.mime4j.field.address,
org.apache.james.mime4j.field.contentdisposition.parser,
org.apache.james.mime4j.field.contenttype.parser,
org.apache.james.mime4j.field.datetime.parser,
org.apache.james.mime4j.field.language.parser,
org.apache.james.mime4j.field.mimeversion.parser,
org.apache.james.mime4j.field.structured.parser,
org.apache.james.mime4j.io,
org.apache.james.mime4j.message,
org.apache.james.mime4j.parser,
org.apache.james.mime4j.stream,
org.apache.james.mime4j.util,
org.apache.tika,
org.apache.tika.config,
org.apache.tika.detect,
org.apache.tika.exception,
org.apache.tika.extractor,
org.apache.tika.fork,
org.apache.tika.io,
org.apache.tika.language,
org.apache.tika.metadata,
org.apache.tika.mime,
org.apache.tika.parser,
org.apache.tika.parser.asm,
org.apache.tika.parser.audio,
org.apache.tika.parser.chm,
org.apache.tika.parser.chm.accessor,
org.apache.tika.parser.chm.assertion,
org.apache.tika.parser.chm.core,
org.apache.tika.parser.chm.exception,
org.apache.tika.parser.chm.lzx,
org.apache.tika.parser.dwg,
org.apache.tika.parser.epub,
org.apache.tika.parser.feed,
org.apache.tika.parser.font,
org.apache.tika.parser.hdf,
org.apache.tika.parser.html,
org.apache.tika.parser.image,
org.apache.tika.parser.image.xmp,
org.apache.tika.parser.internal,
org.apache.tika.parser.iwork,
org.apache.tika.parser.jpeg,
org.apache.tika.parser.mail,
org.apache.tika.parser.mbox,
org.apache.tika.parser.microsoft,
org.apache.tika.parser.microsoft.ooxml,
org.apache.tika.parser.mp3,
org.apache.tika.parser.netcdf,
org.apache.tika.parser.odf,
org.apache.tika.parser.opendocument,
org.apache.tika.parser.pdf,
org.apache.tika.parser.pkg,
org.apache.tika.parser.prt,
org.apache.tika.parser.rtf,
org.apache.tika.parser.txt,
org.apache.tika.parser.video,
org.apache.tika.parser.xml,
org.apache.tika.sax,
org.apache.tika.sax.xpath,
org.apache.tika.utils,
org.ccil.cowan.tagsoup,
org.ccil.cowan.tagsoup.jaxp,
org.cyberneko.html,
org.objectweb.asm,
org.objectweb.asm.signature
Import-Package: org.apache.log4j


@@ -1,14 +0,0 @@
source.. = src/
output.. = bin/
bin.includes = META-INF/,\
.,\
apache-mime4j-core-0.7.jar,\
apache-mime4j-dom-0.7.jar,\
asm-3.1.jar,\
boilerpipe-1.1.0.jar,\
je-4.0.92.jar,\
metadata-extractor-2.4.0-beta-1.jar,\
tagsoup-1.2.1.jar,\
tika-core-1.0.jar,\
tika-parsers-1.0.jar,\
resources/


@@ -1,9 +0,0 @@
log4j.rootCategory=DEBUG, stdout
log4j.appender.stdout.Threshold=INFO
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%5p [%t] %m%n

File diff suppressed because it is too large.


@@ -1,37 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.crawler;
/**
* Several core components of crawler4j extend this class to make them
* configurable.
*
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public abstract class Configurable {
protected CrawlConfig config;
protected Configurable(CrawlConfig config) {
this.config = config;
}
public CrawlConfig getConfig() {
return config;
}
}


@@ -1,384 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.crawler;
public class CrawlConfig {
/**
* The folder which will be used by crawler for storing the intermediate
* crawl data. The content of this folder should not be modified manually.
*/
private String crawlStorageFolder;
/**
* If this feature is enabled, you would be able to resume a previously
* stopped/crashed crawl. However, it makes crawling slightly slower
*/
private boolean resumableCrawling = false;
/**
* Maximum depth of crawling. For unlimited depth this parameter should be
* set to -1.
*/
private int maxDepthOfCrawling = -1;
/**
* Maximum number of pages to fetch. For unlimited number of pages, this
* parameter should be set to -1.
*/
private int maxPagesToFetch = -1;
/**
* user-agent string that is used for representing your crawler to web
* servers. See http://en.wikipedia.org/wiki/User_agent for more details
*/
private String userAgentString = "crawler4j (http://code.google.com/p/crawler4j/)";
/**
* Politeness delay in milliseconds (delay between sending two requests to
* the same host).
*/
private int politenessDelay = 200;
/**
* Should we also crawl https pages?
*/
private boolean includeHttpsPages = false;
/**
* Should we fetch binary content such as images, audio, ...?
*/
private boolean includeBinaryContentInCrawling = false;
/**
* Maximum Connections per host
*/
private int maxConnectionsPerHost = 100;
/**
* Maximum total connections
*/
private int maxTotalConnections = 100;
/**
* Socket timeout in milliseconds
*/
private int socketTimeout = 20000;
/**
* Connection timeout in milliseconds
*/
private int connectionTimeout = 30000;
/**
* Max number of outgoing links which are processed from a page
*/
private int maxOutgoingLinksToFollow = 5000;
/**
* Max allowed size of a page. Pages larger than this size will not be
* fetched.
*/
private int maxDownloadSize = 1048576;
/**
* Should we follow redirects?
*/
private boolean followRedirects = true;
/**
* If crawler should run behind a proxy, this parameter can be used for
* specifying the proxy host.
*/
private String proxyHost = null;
/**
* If crawler should run behind a proxy, this parameter can be used for
* specifying the proxy port.
*/
private int proxyPort = 80;
/**
* If crawler should run behind a proxy and user/pass is needed for
* authentication in proxy, this parameter can be used for specifying the
* username.
*/
private String proxyUsername = null;
/**
* If crawler should run behind a proxy and user/pass is needed for
* authentication in proxy, this parameter can be used for specifying the
* password.
*/
private String proxyPassword = null;
public CrawlConfig() {
}
/**
* Validates the configs specified by this instance.
*
* @throws Exception
*/
public void validate() throws Exception {
if (crawlStorageFolder == null) {
throw new Exception(
"Crawl storage folder is not set in the CrawlConfig.");
}
if (politenessDelay < 0) {
throw new Exception("Invalid value for politeness delay: "
+ politenessDelay);
}
if (maxDepthOfCrawling < -1) {
throw new Exception(
"Maximum crawl depth should be either a positive number or -1 for unlimited depth.");
}
if (maxDepthOfCrawling > Short.MAX_VALUE) {
throw new Exception("Maximum value for crawl depth is "
+ Short.MAX_VALUE);
}
}
public String getCrawlStorageFolder() {
return crawlStorageFolder;
}
/**
* The folder which will be used by crawler for storing the intermediate
* crawl data. The content of this folder should not be modified manually.
*/
public void setCrawlStorageFolder(String crawlStorageFolder) {
this.crawlStorageFolder = crawlStorageFolder;
}
public boolean isResumableCrawling() {
return resumableCrawling;
}
/**
* If this feature is enabled, you would be able to resume a previously
* stopped/crashed crawl. However, it makes crawling slightly slower
*/
public void setResumableCrawling(boolean resumableCrawling) {
this.resumableCrawling = resumableCrawling;
}
public int getMaxDepthOfCrawling() {
return maxDepthOfCrawling;
}
/**
* Maximum depth of crawling. For unlimited depth this parameter should be
* set to -1.
*/
public void setMaxDepthOfCrawling(int maxDepthOfCrawling) {
this.maxDepthOfCrawling = maxDepthOfCrawling;
}
public int getMaxPagesToFetch() {
return maxPagesToFetch;
}
/**
* Maximum number of pages to fetch. For unlimited number of pages, this
* parameter should be set to -1.
*/
public void setMaxPagesToFetch(int maxPagesToFetch) {
this.maxPagesToFetch = maxPagesToFetch;
}
public String getUserAgentString() {
return userAgentString;
}
/**
* user-agent string that is used for representing your crawler to web
* servers. See http://en.wikipedia.org/wiki/User_agent for more details
*/
public void setUserAgentString(String userAgentString) {
this.userAgentString = userAgentString;
}
public int getPolitenessDelay() {
return politenessDelay;
}
/**
* Politeness delay in milliseconds (delay between sending two requests to
* the same host).
*
* @param politenessDelay
* the delay in milliseconds.
*/
public void setPolitenessDelay(int politenessDelay) {
this.politenessDelay = politenessDelay;
}
public boolean isIncludeHttpsPages() {
return includeHttpsPages;
}
/**
* Should we also crawl https pages?
*/
public void setIncludeHttpsPages(boolean includeHttpsPages) {
this.includeHttpsPages = includeHttpsPages;
}
public boolean isIncludeBinaryContentInCrawling() {
return includeBinaryContentInCrawling;
}
/**
* Should we fetch binary content such as images, audio, ...?
*/
public void setIncludeBinaryContentInCrawling(
boolean includeBinaryContentInCrawling) {
this.includeBinaryContentInCrawling = includeBinaryContentInCrawling;
}
public int getMaxConnectionsPerHost() {
return maxConnectionsPerHost;
}
/**
* Maximum Connections per host
*/
public void setMaxConnectionsPerHost(int maxConnectionsPerHost) {
this.maxConnectionsPerHost = maxConnectionsPerHost;
}
public int getMaxTotalConnections() {
return maxTotalConnections;
}
/**
* Maximum total connections
*/
public void setMaxTotalConnections(int maxTotalConnections) {
this.maxTotalConnections = maxTotalConnections;
}
public int getSocketTimeout() {
return socketTimeout;
}
/**
* Socket timeout in milliseconds
*/
public void setSocketTimeout(int socketTimeout) {
this.socketTimeout = socketTimeout;
}
public int getConnectionTimeout() {
return connectionTimeout;
}
/**
* Connection timeout in milliseconds
*/
public void setConnectionTimeout(int connectionTimeout) {
this.connectionTimeout = connectionTimeout;
}
public int getMaxOutgoingLinksToFollow() {
return maxOutgoingLinksToFollow;
}
/**
* Max number of outgoing links which are processed from a page
*/
public void setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow) {
this.maxOutgoingLinksToFollow = maxOutgoingLinksToFollow;
}
public int getMaxDownloadSize() {
return maxDownloadSize;
}
/**
* Max allowed size of a page. Pages larger than this size will not be
* fetched.
*/
public void setMaxDownloadSize(int maxDownloadSize) {
this.maxDownloadSize = maxDownloadSize;
}
public boolean isFollowRedirects() {
return followRedirects;
}
/**
* Should we follow redirects?
*/
public void setFollowRedirects(boolean followRedirects) {
this.followRedirects = followRedirects;
}
public String getProxyHost() {
return proxyHost;
}
/**
* If crawler should run behind a proxy, this parameter can be used for
* specifying the proxy host.
*/
public void setProxyHost(String proxyHost) {
this.proxyHost = proxyHost;
}
public int getProxyPort() {
return proxyPort;
}
/**
* If crawler should run behind a proxy, this parameter can be used for
* specifying the proxy port.
*/
public void setProxyPort(int proxyPort) {
this.proxyPort = proxyPort;
}
public String getProxyUsername() {
return proxyUsername;
}
/**
* If crawler should run behind a proxy and user/pass is needed for
* authentication in proxy, this parameter can be used for specifying the
* username.
*/
public void setProxyUsername(String proxyUsername) {
this.proxyUsername = proxyUsername;
}
public String getProxyPassword() {
return proxyPassword;
}
/**
* If crawler should run behind a proxy and user/pass is needed for
* authentication in proxy, this parameter can be used for specifying the
* password.
*/
public void setProxyPassword(String proxyPassword) {
this.proxyPassword = proxyPassword;
}
}
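
As a rough illustration (not part of this diff), a crawl configuration using only the setters defined above might look like the following; the storage path and limits are placeholders.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;

public class ConfigSketch {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        // validate() throws if no storage folder has been set.
        config.setCrawlStorageFolder("/tmp/crawl-data");   // placeholder path
        config.setPolitenessDelay(1000);                   // 1 second between requests to the same host
        config.setMaxDepthOfCrawling(2);                   // -1 would mean unlimited depth
        config.setMaxPagesToFetch(500);                    // -1 would mean unlimited pages
        config.setIncludeHttpsPages(true);
        config.validate();
    }
}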


@@ -1,462 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.crawler;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.IO;
/**
* The controller that manages a crawling session. This class creates the
* crawler threads and monitors their progress.
*
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class CrawlController extends Configurable {
private static final Logger logger = Logger.getLogger(CrawlController.class
.getName());
/**
* The 'customData' object can be used for passing custom crawl-related
* configurations to different components of the crawler.
*/
protected Object customData;
/**
* Once the crawling session finishes the controller collects the local data
* of the crawler threads and stores them in this List.
*/
protected List<Object> crawlersLocalData = new ArrayList<Object>();
/**
* Is the crawling of this session finished?
*/
protected boolean finished;
/**
* Is the crawling session set to 'shutdown'? Crawler threads monitor this
* flag and when it is set they will no longer process new pages.
*/
protected boolean shuttingDown;
protected PageFetcher pageFetcher;
protected RobotstxtServer robotstxtServer;
protected Frontier frontier;
protected DocIDServer docIdServer;
protected final Object waitingLock = new Object();
public CrawlController(CrawlConfig config, PageFetcher pageFetcher,
RobotstxtServer robotstxtServer) throws Exception {
super(config);
config.validate();
File folder = new File(config.getCrawlStorageFolder());
if (!folder.exists()) {
if (!folder.mkdirs()) {
throw new Exception("Couldn't create this folder: "
+ folder.getAbsolutePath());
}
}
boolean resumable = config.isResumableCrawling();
EnvironmentConfig envConfig = new EnvironmentConfig();
envConfig.setAllowCreate(true);
envConfig.setTransactional(resumable);
envConfig.setLocking(resumable);
File envHome = new File(config.getCrawlStorageFolder() + "/frontier");
if (!envHome.exists()) {
if (!envHome.mkdir()) {
throw new Exception("Couldn't create this folder: "
+ envHome.getAbsolutePath());
}
}
if (!resumable) {
IO.deleteFolderContents(envHome);
}
Environment env = new Environment(envHome, envConfig);
docIdServer = new DocIDServer(env, config);
frontier = new Frontier(env, config, docIdServer);
this.pageFetcher = pageFetcher;
this.robotstxtServer = robotstxtServer;
finished = false;
shuttingDown = false;
}
/**
* Start the crawling session and wait for it to finish.
*
* @param webCrawlers
* the crawler instances that implement the logic for the crawler threads
* @param numberOfCrawlers
* the number of concurrent threads that will be contributing to
* this crawling session.
*/
public <T extends WebCrawler> void start(final ArrayList<T> webCrawlers,
final int numberOfCrawlers) {
this.start(webCrawlers, numberOfCrawlers, true);
}
/**
* Start the crawling session and return immediately.
*
* @param webCrawlers
* the crawler instances that implement the logic for the crawler threads
* @param numberOfCrawlers
* the number of concurrent threads that will be contributing to
* this crawling session.
*/
public <T extends WebCrawler> void startNonBlocking(
ArrayList<T> webCrawlers, final int numberOfCrawlers) {
this.start(webCrawlers, numberOfCrawlers, false);
}
protected <T extends WebCrawler> void start(final ArrayList<T> crawlers,
final int numberOfCrawlers, boolean isBlocking) {
try {
finished = false;
crawlersLocalData.clear();
final List<Thread> threads = new ArrayList<Thread>();
for (int i = 1; i <= numberOfCrawlers; i++) {
T crawler = crawlers.get(i - 1);
Thread thread = new Thread(crawler, "Crawler " + i);
crawler.setThread(thread);
crawler.init(i, this);
thread.start();
threads.add(thread);
// System.out.println("Crawler " + i + " started.");
}
final CrawlController controller = this;
Thread monitorThread = new Thread(new Runnable() {
@Override
public void run() {
try {
synchronized (waitingLock) {
while (true) {
sleep();
boolean someoneIsWorking = false;
for (int i = 0; i < threads.size(); i++) {
Thread thread = threads.get(i);
if (!thread.isAlive()) {
if (!shuttingDown) {
logger.info("Thread " + i
+ " has died.");
// T crawler = _c.newInstance();
// thread = new Thread(crawler,
// "Crawler " + (i + 1));
threads.remove(i);
// threads.add(i, thread);
// crawler.setThread(thread);
// crawler.init(i + 1, controller);
// thread.start();
crawlers.remove(i);
// crawlers.add(i, crawler);
}
} else if (crawlers.get(i)
.isNotWaitingForNewURLs()) {
someoneIsWorking = true;
}
}
if (!someoneIsWorking) {
// Make sure again that none of the threads
// are
// alive.
// System.out
// .println("It looks like no threads are working...");
someoneIsWorking = false;
for (int i = 0; i < threads.size(); i++) {
Thread thread = threads.get(i);
if (thread.isAlive()
&& crawlers
.get(i)
.isNotWaitingForNewURLs()) {
someoneIsWorking = true;
}
}
if (!someoneIsWorking) {
if (!shuttingDown) {
long queueLength = frontier
.getQueueLength();
if (queueLength > 0) {
continue;
}
// System.out
// .println("No thread is working and no more URLs are in queue ...");
queueLength = frontier
.getQueueLength();
if (queueLength > 0) {
continue;
}
}
logger.info("All of the crawlers are stopped. Finishing the process...");
// At this step, frontier notifies the
// threads that were
// waiting for new URLs and they should
// stop
frontier.finish();
for (T crawler : crawlers) {
crawler.onBeforeExit();
crawlersLocalData.add(crawler
.getMyLocalData());
}
// System.out.println("Final clean up...");
frontier.close();
docIdServer.close();
pageFetcher.shutDown();
finished = true;
waitingLock.notifyAll();
return;
}
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
});
monitorThread.start();
if (isBlocking) {
waitUntilFinish();
}
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* Wait until this crawling session finishes.
*/
public void waitUntilFinish() {
while (!finished) {
synchronized (waitingLock) {
if (finished) {
return;
}
try {
waitingLock.wait();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
/**
* Once the crawling session finishes the controller collects the local data
* of the crawler threads and stores them in a List. This function returns
* the reference to this list.
*/
public List<Object> getCrawlersLocalData() {
return crawlersLocalData;
}
protected void sleep() {
try {
Thread.sleep(500);
// System.out.println("Sleeping!!!!!!");
} catch (Exception ignored) {
}
}
/**
* Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
* to extract new URLs in it and follow them for crawling.
*
* @param pageUrl
* the URL of the seed
*/
public void addSeed(String pageUrl) {
addSeed(pageUrl, -1);
}
/**
* Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
* to extract new URLs in it and follow them for crawling. You can also
* specify a specific document id to be assigned to this seed URL. This
* document id needs to be unique. Also, note that if you add three seeds
* with document ids 1, 2, and 7, then the next URL that is found during the
* crawl will get a doc id of 8. You also need to add seeds in
* increasing order of document ids.
*
* Specifying doc ids is mainly useful when you have had a previous crawl
* and have stored the results and want to start a new crawl with seeds
* which get the same document ids as the previous crawl.
*
* @param pageUrl
* the URL of the seed
* @param docId
* the document id that you want to be assigned to this seed URL.
*
*/
public void addSeed(String pageUrl, int docId) {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
if (canonicalUrl == null) {
logger.error("Invalid seed URL: " + pageUrl);
return;
}
if (docId < 0) {
docId = docIdServer.getDocId(canonicalUrl);
if (docId > 0) {
// This URL is already seen.
return;
}
docId = docIdServer.getNewDocID(canonicalUrl);
} else {
try {
docIdServer.addUrlAndDocId(canonicalUrl, docId);
} catch (Exception e) {
logger.error("Could not add seed: " + e.getMessage());
}
}
WebURL webUrl = new WebURL();
webUrl.setURL(canonicalUrl);
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (!robotstxtServer.allows(webUrl)) {
logger.info("Robots.txt does not allow this seed: " + pageUrl);
} else {
frontier.schedule(webUrl);
}
}
/**
* This function can be called to assign a specific document id to a url. This
* feature is useful when you have had a previous crawl and have stored the
* Urls and their associated document ids and want to have a new crawl which
* is aware of the previously seen Urls and won't re-crawl them.
*
* Note that if you add three seen Urls with document ids 1, 2, and 7, then
* the next URL that is found during the crawl will get a doc id of 8. You also
* need to add seen Urls in increasing order of document ids.
*
* @param url
* the URL of the page
* @param docId
* the document id that you want to be assigned to this URL.
*
*/
public void addSeenUrl(String url, int docId) {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(url);
if (canonicalUrl == null) {
logger.error("Invalid Url: " + url);
return;
}
try {
docIdServer.addUrlAndDocId(canonicalUrl, docId);
} catch (Exception e) {
logger.error("Could not add seen url: " + e.getMessage());
}
}
public PageFetcher getPageFetcher() {
return pageFetcher;
}
public void setPageFetcher(PageFetcher pageFetcher) {
this.pageFetcher = pageFetcher;
}
public RobotstxtServer getRobotstxtServer() {
return robotstxtServer;
}
public void setRobotstxtServer(RobotstxtServer robotstxtServer) {
this.robotstxtServer = robotstxtServer;
}
public Frontier getFrontier() {
return frontier;
}
public void setFrontier(Frontier frontier) {
this.frontier = frontier;
}
public DocIDServer getDocIdServer() {
return docIdServer;
}
public void setDocIdServer(DocIDServer docIdServer) {
this.docIdServer = docIdServer;
}
public Object getCustomData() {
return customData;
}
public void setCustomData(Object customData) {
this.customData = customData;
}
public boolean isFinished() {
return this.finished;
}
public boolean isShuttingDown() {
return shuttingDown;
}
/**
* Set the current crawling session to 'shutdown'. Crawler threads
* monitor the shutdown flag and when it is set to true, they will no longer
* process new pages.
*/
public void Shutdown() {
logger.info("Shutting down...");
this.shuttingDown = true;
frontier.finish();
}
}
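
A hedged sketch (not part of this diff) of wiring a crawl session with the controller above. The RobotstxtConfig/RobotstxtServer construction follows the upstream crawler4j API and is an assumption here, since those classes do not appear in this section; MyCrawler is the hypothetical subclass sketched after the WebCrawler class below.

import java.util.ArrayList;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class ControllerSketch {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawl-data");              // placeholder path

        PageFetcher pageFetcher = new PageFetcher(config);
        // Assumed constructor signature, per upstream crawler4j.
        RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);

        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed("http://example.com/");                    // scheduled if robots.txt allows it

        // This variant of start() takes pre-built crawler instances, one per thread.
        int numberOfCrawlers = 4;
        ArrayList<MyCrawler> crawlers = new ArrayList<MyCrawler>();
        for (int i = 0; i < numberOfCrawlers; i++) {
            crawlers.add(new MyCrawler());
        }
        controller.start(crawlers, numberOfCrawlers);                 // blocks until the crawl finishes
    }
}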


@@ -1,155 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.crawler;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.util.EntityUtils;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.url.WebURL;
/**
* This class contains the data for a fetched and parsed page.
*
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class Page {
/**
* The URL of this page.
*/
protected WebURL url;
/**
* The content of this page in binary format.
*/
protected byte[] contentData;
/**
* The ContentType of this page. For example: "text/html; charset=UTF-8"
*/
protected String contentType;
/**
* The encoding of the content. For example: "gzip"
*/
protected String contentEncoding;
/**
* The charset of the content. For example: "UTF-8"
*/
protected String contentCharset;
/**
* The parsed data populated by parsers
*/
protected ParseData parseData;
public Page(WebURL url) {
this.url = url;
}
public WebURL getWebURL() {
return url;
}
public void setWebURL(WebURL url) {
this.url = url;
}
/**
* Loads the content of this page from a fetched HttpEntity.
*/
public void load(HttpEntity entity) throws Exception {
contentType = null;
Header type = entity.getContentType();
if (type != null) {
contentType = type.getValue();
}
contentEncoding = null;
Header encoding = entity.getContentEncoding();
if (encoding != null) {
contentEncoding = encoding.getValue();
}
contentCharset = EntityUtils.getContentCharSet(entity);
contentData = EntityUtils.toByteArray(entity);
}
/**
* Returns the parsed data generated for this page by parsers
*/
public ParseData getParseData() {
return parseData;
}
public void setParseData(ParseData parseData) {
this.parseData = parseData;
}
/**
* Returns the content of this page in binary format.
*/
public byte[] getContentData() {
return contentData;
}
public void setContentData(byte[] contentData) {
this.contentData = contentData;
}
/**
* Returns the ContentType of this page. For example:
* "text/html; charset=UTF-8"
*/
public String getContentType() {
return contentType;
}
public void setContentType(String contentType) {
this.contentType = contentType;
}
/**
* Returns the encoding of the content. For example: "gzip"
*/
public String getContentEncoding() {
return contentEncoding;
}
public void setContentEncoding(String contentEncoding) {
this.contentEncoding = contentEncoding;
}
/**
* Returns the charset of the content. For example: "UTF-8"
*/
public String getContentCharset() {
return contentCharset;
}
public void setContentCharset(String contentCharset) {
this.contentCharset = contentCharset;
}
}


@@ -1,347 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.crawler;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpStatus;
import org.apache.log4j.Logger;
import edu.uci.ics.crawler4j.fetcher.CustomFetchStatus;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;
/**
* WebCrawler is the Runnable class that is executed by each crawler
* thread.
*
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class WebCrawler implements Runnable {
protected static final Logger logger = Logger.getLogger(WebCrawler.class
.getName());
/**
* The id associated to the crawler thread running this instance
*/
protected int myId;
/**
* The controller instance that has created this crawler thread. This
* reference to the controller can be used for getting configurations of the
* current crawl or adding new seeds during runtime.
*/
protected CrawlController myController;
/**
* The thread within which this crawler instance is running.
*/
private Thread myThread;
/**
* The parser that is used by this crawler instance to parse the content of
* the fetched pages.
*/
private Parser parser;
/**
* The fetcher that is used by this crawler instance to fetch the content of
* pages from the web.
*/
private PageFetcher pageFetcher;
/**
* The RobotstxtServer instance that is used by this crawler instance to
* determine whether the crawler is allowed to crawl the content of each
* page.
*/
private RobotstxtServer robotstxtServer;
/**
* The DocIDServer that is used by this crawler instance to map each URL to
* a unique docid.
*/
private DocIDServer docIdServer;
/**
* The Frontier object that manages the crawl queue.
*/
private Frontier frontier;
/**
* Is the current crawler instance waiting for new URLs? This field is
* mainly used by the controller to detect whether all of the crawler
* instances are waiting for new URLs and therefore there is no more work
* and crawling can be stopped.
*/
private boolean isWaitingForNewURLs;
public CrawlController getMyController() {
return myController;
}
/**
* Get the id of the current crawler instance
*
* @return the id of the current crawler instance
*/
public int getMyId() {
return myId;
}
/**
* The CrawlController instance that has created this crawler instance will
* call this function just before terminating this crawler thread. Classes
* that extend WebCrawler can override this function to pass their local
* data to their controller. The controller then puts these local data in a
* List that can then be used for processing the local data of crawlers (if
* needed).
*/
public Object getMyLocalData() {
return null;
}
public Thread getThread() {
return myThread;
}
/**
* This function is called once the header of a page is fetched. It can be
* overridden by sub-classes to perform custom logic for different status
* codes. For example, 404 pages can be logged, etc.
*/
protected void handlePageStatusCode(WebURL webUrl, int statusCode,
String statusDescription) {
}
/**
* Initializes the current instance of the crawler
*
* @param myId
* the id of this crawler instance
* @param crawlController
* the controller that manages this crawling session
*/
public void init(int myId, CrawlController crawlController) {
this.myId = myId;
pageFetcher = crawlController.getPageFetcher();
robotstxtServer = crawlController.getRobotstxtServer();
docIdServer = crawlController.getDocIdServer();
frontier = crawlController.getFrontier();
parser = new Parser(crawlController.getConfig());
myController = crawlController;
isWaitingForNewURLs = false;
}
public boolean isNotWaitingForNewURLs() {
return !isWaitingForNewURLs;
}
/**
* This function is called just before the termination of the current
* crawler instance. It can be used for persisting in-memory data or other
* finalization tasks.
*/
public void onBeforeExit() {
}
/**
* This function is called just before starting the crawl by this crawler
* instance. It can be used for setting up the data structures or
* initializations needed by this crawler instance.
*/
public void onStart() {
}
private void processPage(WebURL curURL) {
if (curURL == null) {
return;
}
PageFetchResult fetchResult = null;
try {
fetchResult = pageFetcher.fetchHeader(curURL);
int statusCode = fetchResult.getStatusCode();
handlePageStatusCode(curURL, statusCode,
CustomFetchStatus.getStatusDescription(statusCode));
if (statusCode != HttpStatus.SC_OK) {
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY
|| statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
if (myController.getConfig().isFollowRedirects()) {
String movedToUrl = fetchResult.getMovedToUrl();
if (movedToUrl == null) {
return;
}
int newDocId = docIdServer.getDocId(movedToUrl);
if (newDocId > 0) {
// Redirect page is already seen
return;
} else {
WebURL webURL = new WebURL();
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setDocid(-1);
if (shouldVisit(webURL)
&& robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer
.getNewDocID(movedToUrl));
frontier.schedule(webURL);
}
}
}
} else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
logger.info("Skipping a page which was bigger than max allowed size: "
+ curURL.getURL());
}
return;
}
if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
// Redirect page is already seen
return;
}
curURL.setURL(fetchResult.getFetchedUrl());
curURL.setDocid(docIdServer.getNewDocID(fetchResult
.getFetchedUrl()));
}
Page page = new Page(curURL);
int docid = curURL.getDocid();
if (fetchResult.fetchContent(page)
&& parser.parse(page, curURL.getURL())) {
ParseData parseData = page.getParseData();
if (parseData instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) parseData;
List<WebURL> toSchedule = new ArrayList<WebURL>();
int maxCrawlDepth = myController.getConfig()
.getMaxDepthOfCrawling();
for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
webURL.setParentDocid(docid);
webURL.setParentUrl(curURL.getURL());
int newdocid = docIdServer.getDocId(webURL.getURL());
if (newdocid > 0) {
// This is not the first time that this Url is
// visited. So, we set the depth to a negative
// number.
webURL.setDepth((short) -1);
webURL.setDocid(newdocid);
} else {
webURL.setDocid(-1);
webURL.setDepth((short) (curURL.getDepth() + 1));
if (maxCrawlDepth == -1
|| curURL.getDepth() < maxCrawlDepth) {
if (shouldVisit(webURL)
&& robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer
.getNewDocID(webURL.getURL()));
toSchedule.add(webURL);
}
}
}
}
frontier.scheduleAll(toSchedule);
}
visit(page);
}
} catch (Exception e) {
e.printStackTrace();
logger.error(e.getMessage() + ", while processing: "
+ curURL.getURL());
} finally {
if (fetchResult != null) {
fetchResult.discardContentIfNotConsumed();
}
}
}
@Override
public void run() {
onStart();
while (true) {
List<WebURL> assignedURLs = new ArrayList<WebURL>(50);
isWaitingForNewURLs = true;
frontier.getNextURLs(50, assignedURLs);
isWaitingForNewURLs = false;
logger.info("assignedURLs size = " + assignedURLs.size());
if (assignedURLs.size() == 0) {
if (frontier.isFinished()) {
logger.info("Exiting because frontier is finished.");
return;
}
try {
Thread.sleep(500);
} catch (InterruptedException e) {
e.printStackTrace();
}
} else {
for (WebURL curURL : assignedURLs) {
if (curURL != null) {
processPage(curURL);
frontier.setProcessed(curURL);
}
if (myController.isShuttingDown()) {
logger.info("Exiting because of controller shutdown.");
return;
}
}
}
}
}
public void setThread(Thread myThread) {
this.myThread = myThread;
}
/**
* Classes that extend WebCrawler can override this function to tell the
* crawler whether the given url should be crawled or not. The following
* implementation indicates that all urls should be included in the crawl.
*
* @param url
* the url which we are interested to know whether it should be
* included in the crawl or not.
* @return if the url should be included in the crawl it returns true,
* otherwise false is returned.
*/
public boolean shouldVisit(WebURL url) {
return true;
}
/**
* Classes that extend WebCrawler can override this function to process
* the content of the fetched and parsed page.
*
* @param page
* the page object that is just fetched and parsed.
*/
public void visit(Page page) {
}
}
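
To make the extension points above concrete, a hypothetical subclass (the class name, URL filter, and domain are illustrative, not from this commit):

import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Illustrative filter: skip common static/binary resources.
    private static final Pattern SKIP = Pattern.compile(".*\\.(css|js|gif|jpe?g|png|zip|gz)$");

    @Override
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase();
        return !SKIP.matcher(href).matches()
                && href.startsWith("http://example.com/");   // placeholder domain
    }

    @Override
    public void visit(Page page) {
        // For HTML pages the parse data is an HtmlParseData; outgoing links were
        // already scheduled by processPage() before visit() is called.
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData html = (HtmlParseData) page.getParseData();
            System.out.println(page.getWebURL().getURL() + " -> "
                    + html.getOutgoingUrls().size() + " outgoing links");
        }
    }

    @Override
    protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
        // statusDescription is produced by CustomFetchStatus.getStatusDescription(statusCode).
        if (statusCode != 200) {
            logger.info(statusCode + " (" + statusDescription + ") for " + webUrl.getURL());
        }
    }
}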


@@ -1,106 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.fetcher;
import org.apache.http.HttpStatus;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class CustomFetchStatus {
public static final int PageTooBig = 1001;
public static final int FatalTransportError = 1005;
public static final int UnknownError = 1006;
public static String getStatusDescription(int code) {
switch (code) {
case HttpStatus.SC_OK:
return "OK";
case HttpStatus.SC_CREATED:
return "Created";
case HttpStatus.SC_ACCEPTED:
return "Accepted";
case HttpStatus.SC_NO_CONTENT:
return "No Content";
case HttpStatus.SC_MOVED_PERMANENTLY:
return "Moved Permanently";
case HttpStatus.SC_MOVED_TEMPORARILY:
return "Moved Temporarily";
case HttpStatus.SC_NOT_MODIFIED:
return "Not Modified";
case HttpStatus.SC_BAD_REQUEST:
return "Bad Request";
case HttpStatus.SC_UNAUTHORIZED:
return "Unauthorized";
case HttpStatus.SC_FORBIDDEN:
return "Forbidden";
case HttpStatus.SC_NOT_FOUND:
return "Not Found";
case HttpStatus.SC_INTERNAL_SERVER_ERROR:
return "Internal Server Error";
case HttpStatus.SC_NOT_IMPLEMENTED:
return "Not Implemented";
case HttpStatus.SC_BAD_GATEWAY:
return "Bad Gateway";
case HttpStatus.SC_SERVICE_UNAVAILABLE:
return "Service Unavailable";
case HttpStatus.SC_CONTINUE:
return "Continue";
case HttpStatus.SC_TEMPORARY_REDIRECT:
return "Temporary Redirect";
case HttpStatus.SC_METHOD_NOT_ALLOWED:
return "Method Not Allowed";
case HttpStatus.SC_CONFLICT:
return "Conflict";
case HttpStatus.SC_PRECONDITION_FAILED:
return "Precondition Failed";
case HttpStatus.SC_REQUEST_TOO_LONG:
return "Request Too Long";
case HttpStatus.SC_REQUEST_URI_TOO_LONG:
return "Request-URI Too Long";
case HttpStatus.SC_UNSUPPORTED_MEDIA_TYPE:
return "Unsupported Media Type";
case HttpStatus.SC_MULTIPLE_CHOICES:
return "Multiple Choices";
case HttpStatus.SC_SEE_OTHER:
return "See Other";
case HttpStatus.SC_USE_PROXY:
return "Use Proxy";
case HttpStatus.SC_PAYMENT_REQUIRED:
return "Payment Required";
case HttpStatus.SC_NOT_ACCEPTABLE:
return "Not Acceptable";
case HttpStatus.SC_PROXY_AUTHENTICATION_REQUIRED:
return "Proxy Authentication Required";
case HttpStatus.SC_REQUEST_TIMEOUT:
return "Request Timeout";
case PageTooBig:
return "Page size was too big";
case FatalTransportError:
return "Fatal transport error";
case UnknownError:
return "Unknown error";
default:
return "(" + code + ")";
}
}
}


@@ -1,60 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.fetcher;
import java.util.concurrent.TimeUnit;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
public class IdleConnectionMonitorThread extends Thread {
private final ThreadSafeClientConnManager connMgr;
private volatile boolean shutdown;
public IdleConnectionMonitorThread(ThreadSafeClientConnManager connMgr) {
super("Connection Manager");
this.connMgr = connMgr;
}
@Override
public void run() {
try {
while (!shutdown) {
synchronized (this) {
wait(5000);
// Close expired connections
connMgr.closeExpiredConnections();
// Optionally, close connections
// that have been idle longer than 30 sec
connMgr.closeIdleConnections(30, TimeUnit.SECONDS);
}
}
} catch (InterruptedException ex) {
// terminate
}
}
public void shutdown() {
shutdown = true;
synchronized (this) {
notifyAll();
}
}
}
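
A small usage sketch (illustrative, not part of this diff) for the monitor thread above, against the HttpClient 4.1 connection manager it expects:

import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;

import edu.uci.ics.crawler4j.fetcher.IdleConnectionMonitorThread;

public class MonitorSketch {
    public static void main(String[] args) throws InterruptedException {
        ThreadSafeClientConnManager connMgr = new ThreadSafeClientConnManager();

        // Wakes every 5 seconds, evicting expired connections and any idle longer than 30 seconds.
        IdleConnectionMonitorThread monitor = new IdleConnectionMonitorThread(connMgr);
        monitor.start();

        // ... issue HTTP requests through a client built on connMgr ...

        // Stop the monitor and release pooled connections when finished.
        monitor.shutdown();
        monitor.join();
        connMgr.shutdown();
    }
}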


@@ -1,105 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.fetcher;
import java.io.EOFException;
import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import edu.uci.ics.crawler4j.crawler.Page;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class PageFetchResult {
protected static final Logger logger = Logger
.getLogger(PageFetchResult.class);
protected int statusCode;
protected HttpEntity entity = null;
protected String fetchedUrl = null;
protected String movedToUrl = null;
public int getStatusCode() {
return statusCode;
}
public void setStatusCode(int statusCode) {
this.statusCode = statusCode;
}
public HttpEntity getEntity() {
return entity;
}
public void setEntity(HttpEntity entity) {
this.entity = entity;
}
public String getFetchedUrl() {
return fetchedUrl;
}
public void setFetchedUrl(String fetchedUrl) {
this.fetchedUrl = fetchedUrl;
}
public boolean fetchContent(Page page) {
try {
page.load(entity);
return true;
} catch (Exception e) {
logger.info("Exception while fetching content for: "
+ page.getWebURL().getURL() + " [" + e.getMessage() + "]");
}
return false;
}
public void discardContentIfNotConsumed() {
try {
if (entity != null) {
EntityUtils.consume(entity);
}
} catch (EOFException e) {
// We can ignore this exception. It can happen on compressed streams
// which are not
// repeatable
} catch (IOException e) {
// We can ignore this exception. It can happen if the stream is
// closed.
} catch (Exception e) {
e.printStackTrace();
}
}
public String getMovedToUrl() {
return movedToUrl;
}
public void setMovedToUrl(String movedToUrl) {
this.movedToUrl = movedToUrl;
}
}


@@ -1,294 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.fetcher;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.zip.GZIPInputStream;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpException;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpResponseInterceptor;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.HttpEntityWrapper;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.apache.http.protocol.HttpContext;
import org.apache.log4j.Logger;
import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class PageFetcher extends Configurable {
protected static final Logger logger = Logger.getLogger(PageFetcher.class);
protected ThreadSafeClientConnManager connectionManager;
protected DefaultHttpClient httpClient;
protected final Object mutex = new Object();
protected long lastFetchTime = 0;
protected IdleConnectionMonitorThread connectionMonitorThread = null;
public PageFetcher(CrawlConfig config) {
super(config);
HttpParams params = new BasicHttpParams();
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
paramsBean.setVersion(HttpVersion.HTTP_1_1);
paramsBean.setContentCharset("UTF-8");
paramsBean.setUseExpectContinue(false);
params.setParameter(CoreProtocolPNames.USER_AGENT,
config.getUserAgentString());
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT,
config.getSocketTimeout());
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,
config.getConnectionTimeout());
params.setBooleanParameter("http.protocol.handle-redirects", false);
SchemeRegistry schemeRegistry = new SchemeRegistry();
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory
.getSocketFactory()));
if (config.isIncludeHttpsPages()) {
schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory
.getSocketFactory()));
}
connectionManager = new ThreadSafeClientConnManager(schemeRegistry);
connectionManager.setMaxTotal(config.getMaxTotalConnections());
connectionManager.setDefaultMaxPerRoute(config
.getMaxConnectionsPerHost());
httpClient = new DefaultHttpClient(connectionManager, params);
if (config.getProxyHost() != null) {
if (config.getProxyUsername() != null) {
httpClient.getCredentialsProvider()
.setCredentials(
new AuthScope(config.getProxyHost(),
config.getProxyPort()),
new UsernamePasswordCredentials(config
.getProxyUsername(), config
.getProxyPassword()));
}
HttpHost proxy = new HttpHost(config.getProxyHost(),
config.getProxyPort());
httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,
proxy);
}
httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
@Override
public void process(final HttpResponse response,
final HttpContext context) throws HttpException,
IOException {
HttpEntity entity = response.getEntity();
Header contentEncoding = entity.getContentEncoding();
if (contentEncoding != null) {
HeaderElement[] codecs = contentEncoding.getElements();
for (HeaderElement codec : codecs) {
if (codec.getName().equalsIgnoreCase("gzip")) {
response.setEntity(new GzipDecompressingEntity(
response.getEntity()));
return;
}
}
}
}
});
if (connectionMonitorThread == null) {
connectionMonitorThread = new IdleConnectionMonitorThread(
connectionManager);
}
connectionMonitorThread.start();
}
public PageFetchResult fetchHeader(WebURL webUrl) {
PageFetchResult fetchResult = new PageFetchResult();
String toFetchURL = webUrl.getURL();
HttpGet get = null;
try {
get = new HttpGet(toFetchURL);
synchronized (mutex) {
long now = (new Date()).getTime();
if (now - lastFetchTime < config.getPolitenessDelay()) {
Thread.sleep(config.getPolitenessDelay()
- (now - lastFetchTime));
}
lastFetchTime = (new Date()).getTime();
}
get.addHeader("Accept-Encoding", "gzip");
HttpResponse response = httpClient.execute(get);
fetchResult.setEntity(response.getEntity());
int statusCode = response.getStatusLine().getStatusCode();
if (statusCode != HttpStatus.SC_OK) {
if (statusCode != HttpStatus.SC_NOT_FOUND) {
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY
|| statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
Header header = response.getFirstHeader("Location");
if (header != null) {
String movedToUrl = header.getValue();
movedToUrl = URLCanonicalizer.getCanonicalURL(
movedToUrl, toFetchURL);
fetchResult.setMovedToUrl(movedToUrl);
}
fetchResult.setStatusCode(statusCode);
return fetchResult;
}
logger.info("Failed: "
+ response.getStatusLine().toString()
+ ", while fetching " + toFetchURL);
}
fetchResult.setStatusCode(response.getStatusLine()
.getStatusCode());
return fetchResult;
}
fetchResult.setFetchedUrl(toFetchURL);
String uri = get.getURI().toString();
if (!uri.equals(toFetchURL)) {
if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
fetchResult.setFetchedUrl(uri);
}
}
if (fetchResult.getEntity() != null) {
long size = fetchResult.getEntity().getContentLength();
if (size == -1) {
Header length = response.getLastHeader("Content-Length");
if (length == null) {
length = response.getLastHeader("Content-length");
}
if (length != null) {
size = Integer.parseInt(length.getValue());
} else {
size = -1;
}
}
if (size > config.getMaxDownloadSize()) {
fetchResult.setStatusCode(CustomFetchStatus.PageTooBig);
return fetchResult;
}
fetchResult.setStatusCode(HttpStatus.SC_OK);
return fetchResult;
} else {
get.abort();
}
} catch (IOException e) {
logger.error("Fatal transport error: " + e.getMessage()
+ " while fetching " + toFetchURL + " (link found in doc #"
+ webUrl.getParentDocid() + ")");
fetchResult.setStatusCode(CustomFetchStatus.FatalTransportError);
return fetchResult;
} catch (IllegalStateException e) {
// ignoring exceptions that occur because of not registering https
// and other schemes
} catch (Exception e) {
if (e.getMessage() == null) {
logger.error("Error while fetching " + webUrl.getURL());
} else {
logger.error(e.getMessage() + " while fetching "
+ webUrl.getURL());
}
} finally {
try {
if (fetchResult.getEntity() == null && get != null) {
get.abort();
}
} catch (Exception e) {
e.printStackTrace();
}
}
fetchResult.setStatusCode(CustomFetchStatus.UnknownError);
return fetchResult;
}
public synchronized void shutDown() {
if (connectionMonitorThread != null) {
connectionManager.shutdown();
connectionMonitorThread.shutdown();
}
}
public HttpClient getHttpClient() {
return httpClient;
}
private static class GzipDecompressingEntity extends HttpEntityWrapper {
public GzipDecompressingEntity(final HttpEntity entity) {
super(entity);
}
@Override
public InputStream getContent() throws IOException,
IllegalStateException {
// the wrapped entity's getContent() decides about repeatability
InputStream wrappedin = wrappedEntity.getContent();
return new GZIPInputStream(wrappedin);
}
@Override
public long getContentLength() {
// length of ungzipped content is not known
return -1;
}
}
}
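
A hedged end-to-end sketch of the fetcher above: fetch the headers, then pull the body into a Page. The CrawlConfig setters used here are assumed to mirror the getters called in the constructor, and the target URL is purely illustrative.

import org.apache.http.HttpStatus;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.url.WebURL;

public class PageFetcherExample {
    public static void main(String[] args) {
        CrawlConfig config = new CrawlConfig();
        // Setters assumed to match the getters used by PageFetcher.
        config.setPolitenessDelay(200);          // ms between requests
        config.setUserAgentString("crawler4j");

        PageFetcher fetcher = new PageFetcher(config);
        try {
            WebURL url = new WebURL();
            url.setURL("http://example.com/");   // illustrative URL

            // Issues the GET, honoring the politeness delay; redirects are not followed.
            PageFetchResult result = fetcher.fetchHeader(url);
            if (result.getStatusCode() == HttpStatus.SC_OK) {
                Page page = new Page(url);
                if (result.fetchContent(page)) {
                    System.out.println("Fetched " + page.getWebURL().getURL()
                            + " (" + page.getContentData().length + " bytes)");
                }
            } else {
                System.out.println("Status: " + result.getStatusCode());
            }
            result.discardContentIfNotConsumed();
        } finally {
            fetcher.shutDown();
        }
    }
}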

View file

@ -1,155 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.frontier;
import java.util.HashMap;
import java.util.Map;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.Transaction;
import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.util.Util;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class Counters extends Configurable {
public class ReservedCounterNames {
public final static String SCHEDULED_PAGES = "Scheduled-Pages";
public final static String PROCESSED_PAGES = "Processed-Pages";
}
protected Database statisticsDB = null;
protected Environment env;
protected final Object mutex = new Object();
protected Map<String, Long> counterValues;
public Counters(Environment env, CrawlConfig config)
throws DatabaseException {
super(config);
this.env = env;
this.counterValues = new HashMap<String, Long>();
/*
* When crawling is set to be resumable, we have to keep the statistics
* in a transactional database to make sure they are not lost if the
* crawler crashes or is terminated unexpectedly.
*/
if (config.isResumableCrawling()) {
DatabaseConfig dbConfig = new DatabaseConfig();
dbConfig.setAllowCreate(true);
dbConfig.setTransactional(true);
dbConfig.setDeferredWrite(false);
statisticsDB = env.openDatabase(null, "Statistics", dbConfig);
OperationStatus result;
DatabaseEntry key = new DatabaseEntry();
DatabaseEntry value = new DatabaseEntry();
Transaction tnx = env.beginTransaction(null, null);
Cursor cursor = statisticsDB.openCursor(tnx, null);
result = cursor.getFirst(key, value, null);
while (result == OperationStatus.SUCCESS) {
if (value.getData().length > 0) {
String name = new String(key.getData());
long counterValue = Util.byteArray2Long(value.getData());
counterValues.put(name, counterValue);
}
result = cursor.getNext(key, value, null);
}
cursor.close();
tnx.commit();
}
}
public long getValue(String name) {
synchronized (mutex) {
Long value = counterValues.get(name);
if (value == null) {
return 0;
}
return value;
}
}
public void setValue(String name, long value) {
synchronized (mutex) {
try {
counterValues.put(name, value);
if (statisticsDB != null) {
Transaction txn = env.beginTransaction(null, null);
statisticsDB.put(txn, new DatabaseEntry(name.getBytes()),
new DatabaseEntry(Util.long2ByteArray(value)));
txn.commit();
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
public void increment(String name) {
increment(name, 1);
}
public void increment(String name, long addition) {
synchronized (mutex) {
long prevValue = getValue(name);
setValue(name, prevValue + addition);
}
}
public void sync() {
if (config.isResumableCrawling()) {
return;
}
if (statisticsDB == null) {
return;
}
try {
statisticsDB.sync();
} catch (DatabaseException e) {
e.printStackTrace();
}
}
public void close() {
try {
if (statisticsDB != null) {
statisticsDB.close();
}
} catch (DatabaseException e) {
e.printStackTrace();
}
}
}
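
A small sketch of the counter API above, using a default (assumed non-resumable) CrawlConfig so the counters stay in memory; the Berkeley DB environment and scratch directory are illustrative assumptions.

import java.io.File;

import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.frontier.Counters;

public class CountersExample {
    public static void main(String[] args) throws Exception {
        // Illustrative scratch directory for the JE environment.
        File dir = new File("/tmp/crawler4j-counters");
        dir.mkdirs();
        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setAllowCreate(true);
        Environment env = new Environment(dir, envConfig);

        // Default config: resumable crawling assumed off, so no Statistics DB is opened.
        Counters counters = new Counters(env, new CrawlConfig());

        counters.increment(Counters.ReservedCounterNames.SCHEDULED_PAGES, 5);
        counters.increment(Counters.ReservedCounterNames.PROCESSED_PAGES);
        System.out.println("Scheduled: "
                + counters.getValue(Counters.ReservedCounterNames.SCHEDULED_PAGES));

        counters.close();
        env.close();
    }
}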

View file

@ -1,176 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.frontier;
import org.apache.log4j.Logger;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.OperationStatus;
import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.util.Util;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class DocIDServer extends Configurable {
protected static final Logger logger = Logger.getLogger(DocIDServer.class
.getName());
protected Database docIDsDB = null;
protected final Object mutex = new Object();
protected int lastDocID;
public DocIDServer(Environment env, CrawlConfig config)
throws DatabaseException {
super(config);
DatabaseConfig dbConfig = new DatabaseConfig();
dbConfig.setAllowCreate(true);
dbConfig.setTransactional(config.isResumableCrawling());
dbConfig.setDeferredWrite(!config.isResumableCrawling());
docIDsDB = env.openDatabase(null, "DocIDs", dbConfig);
if (config.isResumableCrawling()) {
int docCount = getDocCount();
if (docCount > 0) {
logger.info("Loaded " + docCount
+ " URLs that had been detected in previous crawl.");
lastDocID = docCount;
}
} else {
lastDocID = 0;
}
}
/**
* Returns the docid of an already seen url.
*
* @param url
* the URL for which the docid is returned.
* @return the docid of the url if it has been seen before; otherwise -1
*         is returned.
*/
public int getDocId(String url) {
synchronized (mutex) {
if (docIDsDB == null) {
return -1;
}
OperationStatus result;
DatabaseEntry value = new DatabaseEntry();
try {
DatabaseEntry key = new DatabaseEntry(url.getBytes());
result = docIDsDB.get(null, key, value, null);
if (result == OperationStatus.SUCCESS
&& value.getData().length > 0) {
return Util.byteArray2Int(value.getData());
}
} catch (Exception e) {
e.printStackTrace();
}
return -1;
}
}
public int getNewDocID(String url) {
synchronized (mutex) {
try {
// Make sure that we have not already assigned a docid for this
// URL
int docid = getDocId(url);
if (docid > 0) {
return docid;
}
lastDocID++;
docIDsDB.put(null, new DatabaseEntry(url.getBytes()),
new DatabaseEntry(Util.int2ByteArray(lastDocID)));
return lastDocID;
} catch (Exception e) {
e.printStackTrace();
}
return -1;
}
}
public void addUrlAndDocId(String url, int docId) throws Exception {
synchronized (mutex) {
if (docId <= lastDocID) {
throw new Exception("Requested doc id: " + docId
+ " is not larger than: " + lastDocID);
}
// Make sure that we have not already assigned a docid for this URL
int prevDocid = getDocId(url);
if (prevDocid > 0) {
if (prevDocid == docId) {
return;
}
throw new Exception("Doc id: " + prevDocid
+ " is already assigned to URL: " + url);
}
docIDsDB.put(null, new DatabaseEntry(url.getBytes()),
new DatabaseEntry(Util.int2ByteArray(docId)));
lastDocID = docId;
}
}
public boolean isSeenBefore(String url) {
return getDocId(url) != -1;
}
public int getDocCount() {
try {
return (int) docIDsDB.count();
} catch (DatabaseException e) {
e.printStackTrace();
}
return -1;
}
public void sync() {
if (config.isResumableCrawling()) {
return;
}
if (docIDsDB == null) {
return;
}
try {
// docIDsDB.sync();
} catch (DatabaseException e) {
e.printStackTrace();
}
}
public void close() {
try {
docIDsDB.close();
} catch (DatabaseException e) {
e.printStackTrace();
}
}
}
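
A brief sketch of assigning and looking up document ids with the class above; the environment setup mirrors the Counters sketch and the directory path is an illustrative assumption.

import java.io.File;

import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.frontier.DocIDServer;

public class DocIDServerExample {
    public static void main(String[] args) throws Exception {
        File dir = new File("/tmp/crawler4j-docids");   // illustrative path
        dir.mkdirs();
        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setAllowCreate(true);
        Environment env = new Environment(dir, envConfig);

        DocIDServer docIdServer = new DocIDServer(env, new CrawlConfig());

        // The first sighting of a URL assigns the next id; later calls return the same id.
        int id = docIdServer.getNewDocID("http://example.com/index.html");
        System.out.println("Assigned docid " + id);
        System.out.println("Seen before? "
                + docIdServer.isSeenBefore("http://example.com/index.html"));
        System.out.println("Lookup: "
                + docIdServer.getDocId("http://example.com/index.html"));

        docIdServer.close();
        env.close();
    }
}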

View file

@ -1,216 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.frontier;
import java.util.List;
import org.apache.log4j.Logger;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.frontier.Counters.ReservedCounterNames;
import edu.uci.ics.crawler4j.url.WebURL;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class Frontier extends Configurable {
protected static final Logger logger = Logger.getLogger(Frontier.class
.getName());
protected WorkQueues workQueues;
protected InProcessPagesDB inProcessPages;
protected final Object mutex = new Object();
protected final Object waitingList = new Object();
protected boolean isFinished = false;
protected long scheduledPages;
protected DocIDServer docIdServer;
protected Counters counters;
public Frontier(Environment env, CrawlConfig config, DocIDServer docIdServer) {
super(config);
this.counters = new Counters(env, config);
this.docIdServer = docIdServer;
try {
workQueues = new WorkQueues(env, "PendingURLsDB",
config.isResumableCrawling());
if (config.isResumableCrawling()) {
scheduledPages = counters
.getValue(ReservedCounterNames.SCHEDULED_PAGES);
inProcessPages = new InProcessPagesDB(env);
long numPreviouslyInProcessPages = inProcessPages.getLength();
if (numPreviouslyInProcessPages > 0) {
logger.info("Rescheduling " + numPreviouslyInProcessPages
+ " URLs from previous crawl.");
scheduledPages -= numPreviouslyInProcessPages;
while (true) {
List<WebURL> urls = inProcessPages.get(100);
if (urls.size() == 0) {
break;
}
scheduleAll(urls);
inProcessPages.delete(urls.size());
}
}
} else {
inProcessPages = null;
scheduledPages = 0;
}
} catch (DatabaseException e) {
logger.error("Error while initializing the Frontier: "
+ e.getMessage());
workQueues = null;
}
}
public void scheduleAll(List<WebURL> urls) {
int maxPagesToFetch = config.getMaxPagesToFetch();
synchronized (mutex) {
int newScheduledPage = 0;
for (WebURL url : urls) {
if (maxPagesToFetch > 0
&& (scheduledPages + newScheduledPage) >= maxPagesToFetch) {
break;
}
try {
workQueues.put(url);
newScheduledPage++;
} catch (DatabaseException e) {
logger.error("Error while puting the url in the work queue.");
}
}
if (newScheduledPage > 0) {
scheduledPages += newScheduledPage;
counters.increment(
Counters.ReservedCounterNames.SCHEDULED_PAGES,
newScheduledPage);
}
synchronized (waitingList) {
waitingList.notifyAll();
}
}
}
public void schedule(WebURL url) {
int maxPagesToFetch = config.getMaxPagesToFetch();
synchronized (mutex) {
try {
if (maxPagesToFetch < 0 || scheduledPages < maxPagesToFetch) {
workQueues.put(url);
scheduledPages++;
counters.increment(Counters.ReservedCounterNames.SCHEDULED_PAGES);
}
} catch (DatabaseException e) {
logger.error("Error while puting the url in the work queue.");
}
}
}
public void getNextURLs(int max, List<WebURL> result) {
while (true) {
synchronized (mutex) {
if (isFinished) {
return;
}
try {
List<WebURL> curResults = workQueues.get(max);
workQueues.delete(curResults.size());
if (inProcessPages != null) {
for (WebURL curPage : curResults) {
inProcessPages.put(curPage);
}
}
result.addAll(curResults);
} catch (DatabaseException e) {
logger.error("Error while getting next urls: "
+ e.getMessage());
e.printStackTrace();
}
if (result.size() > 0) {
return;
}
}
try {
synchronized (waitingList) {
waitingList.wait();
}
} catch (InterruptedException ignored) {
}
if (isFinished) {
return;
}
}
}
public void setProcessed(WebURL webURL) {
counters.increment(ReservedCounterNames.PROCESSED_PAGES);
if (inProcessPages != null) {
if (!inProcessPages.removeURL(webURL)) {
logger.warn("Could not remove: " + webURL.getURL()
+ " from list of processed pages.");
}
}
}
public long getQueueLength() {
return workQueues.getLength();
}
public long getNumberOfAssignedPages() {
return inProcessPages.getLength();
}
public long getNumberOfProcessedPages() {
return counters.getValue(ReservedCounterNames.PROCESSED_PAGES);
}
public void sync() {
workQueues.sync();
docIdServer.sync();
counters.sync();
}
public boolean isFinished() {
return isFinished;
}
public void close() {
sync();
workQueues.close();
counters.close();
}
public void finish() {
isFinished = true;
synchronized (waitingList) {
waitingList.notifyAll();
}
}
}

View file

@ -1,91 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.frontier;
import org.apache.log4j.Logger;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.Transaction;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;
/**
* This class maintains the list of pages which are assigned to crawlers but are
* not yet processed. It is used for resuming a previous crawl.
*
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class InProcessPagesDB extends WorkQueues {
private static final Logger logger = Logger
.getLogger(InProcessPagesDB.class.getName());
public InProcessPagesDB(Environment env) throws DatabaseException {
super(env, "InProcessPagesDB", true);
long docCount = getLength();
if (docCount > 0) {
logger.info("Loaded " + docCount
+ " URLs that have been in process in the previous crawl.");
}
}
public boolean removeURL(WebURL webUrl) {
synchronized (mutex) {
try {
DatabaseEntry key = new DatabaseEntry(Util.int2ByteArray(webUrl
.getDocid()));
Cursor cursor = null;
OperationStatus result;
DatabaseEntry value = new DatabaseEntry();
Transaction txn = env.beginTransaction(null, null);
try {
cursor = urlsDB.openCursor(txn, null);
result = cursor.getSearchKey(key, value, null);
if (result == OperationStatus.SUCCESS) {
result = cursor.delete();
if (result == OperationStatus.SUCCESS) {
return true;
}
}
} catch (DatabaseException e) {
if (txn != null) {
txn.abort();
txn = null;
}
throw e;
} finally {
if (cursor != null) {
cursor.close();
}
if (txn != null) {
txn.commit();
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
return false;
}
}

View file

@ -1,50 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.frontier;
import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import edu.uci.ics.crawler4j.url.WebURL;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class WebURLTupleBinding extends TupleBinding<WebURL> {
@Override
public WebURL entryToObject(TupleInput input) {
WebURL webURL = new WebURL();
webURL.setURL(input.readString());
webURL.setDocid(input.readInt());
webURL.setParentDocid(input.readInt());
webURL.setParentUrl(input.readString());
webURL.setDepth(input.readShort());
return webURL;
}
@Override
public void objectToEntry(WebURL url, TupleOutput output) {
output.writeString(url.getURL());
output.writeInt(url.getDocid());
output.writeInt(url.getParentDocid());
output.writeString(url.getParentUrl());
output.writeShort(url.getDepth());
}
}
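
A round-trip sketch for the binding above, using the DatabaseEntry overloads inherited from TupleBinding; the field values are arbitrary.

import com.sleepycat.je.DatabaseEntry;

import edu.uci.ics.crawler4j.frontier.WebURLTupleBinding;
import edu.uci.ics.crawler4j.url.WebURL;

public class WebURLTupleBindingExample {
    public static void main(String[] args) {
        WebURLTupleBinding binding = new WebURLTupleBinding();

        WebURL url = new WebURL();
        url.setURL("http://example.com/a.html");
        url.setDocid(42);
        url.setParentDocid(7);
        url.setParentUrl("http://example.com/");
        url.setDepth((short) 2);

        // Serialize to a DatabaseEntry, then read it back.
        DatabaseEntry entry = new DatabaseEntry();
        binding.objectToEntry(url, entry);
        WebURL copy = binding.entryToObject(entry);

        System.out.println(copy.getURL() + " docid=" + copy.getDocid()
                + " depth=" + copy.getDepth());
    }
}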

View file

@ -1,197 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.frontier;
import java.util.ArrayList;
import java.util.List;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.Transaction;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class WorkQueues {
protected Database urlsDB = null;
protected Environment env;
protected boolean resumable;
protected WebURLTupleBinding webURLBinding;
protected final Object mutex = new Object();
public WorkQueues(Environment env, String dbName, boolean resumable)
throws DatabaseException {
this.env = env;
this.resumable = resumable;
DatabaseConfig dbConfig = new DatabaseConfig();
dbConfig.setAllowCreate(true);
dbConfig.setTransactional(resumable);
dbConfig.setDeferredWrite(!resumable);
urlsDB = env.openDatabase(null, dbName, dbConfig);
webURLBinding = new WebURLTupleBinding();
}
public List<WebURL> get(int max) throws DatabaseException {
synchronized (mutex) {
int matches = 0;
List<WebURL> results = new ArrayList<WebURL>(max);
Cursor cursor = null;
OperationStatus result;
DatabaseEntry key = new DatabaseEntry();
DatabaseEntry value = new DatabaseEntry();
Transaction txn;
if (resumable) {
txn = env.beginTransaction(null, null);
} else {
txn = null;
}
try {
cursor = urlsDB.openCursor(txn, null);
result = cursor.getFirst(key, value, null);
while (matches < max && result == OperationStatus.SUCCESS) {
if (value.getData().length > 0) {
results.add(webURLBinding.entryToObject(value));
matches++;
}
result = cursor.getNext(key, value, null);
}
} catch (DatabaseException e) {
if (txn != null) {
txn.abort();
txn = null;
}
throw e;
} finally {
if (cursor != null) {
cursor.close();
}
if (txn != null) {
txn.commit();
}
}
return results;
}
}
public void delete(int count) throws DatabaseException {
synchronized (mutex) {
int matches = 0;
Cursor cursor = null;
OperationStatus result;
DatabaseEntry key = new DatabaseEntry();
DatabaseEntry value = new DatabaseEntry();
Transaction txn;
if (resumable) {
txn = env.beginTransaction(null, null);
} else {
txn = null;
}
try {
cursor = urlsDB.openCursor(txn, null);
result = cursor.getFirst(key, value, null);
while (matches < count && result == OperationStatus.SUCCESS) {
cursor.delete();
matches++;
result = cursor.getNext(key, value, null);
}
} catch (DatabaseException e) {
if (txn != null) {
txn.abort();
txn = null;
}
throw e;
} finally {
if (cursor != null) {
cursor.close();
}
if (txn != null) {
txn.commit();
}
}
}
}
public void put(WebURL url) throws DatabaseException {
byte[] keyData = Util.int2ByteArray(url.getDocid());
DatabaseEntry value = new DatabaseEntry();
webURLBinding.objectToEntry(url, value);
Transaction txn;
if (resumable) {
txn = env.beginTransaction(null, null);
} else {
txn = null;
}
urlsDB.put(txn, new DatabaseEntry(keyData), value);
if (resumable) {
if (txn != null) {
txn.commit();
}
}
}
public long getLength() {
try {
return urlsDB.count();
} catch (Exception e) {
e.printStackTrace();
}
return -1;
}
public void sync() {
// System.out.println("Syncing Sleepy Cat DB");
if (resumable) {
return;
}
if (urlsDB == null) {
return;
}
try {
urlsDB.sync();
} catch (DatabaseException e) {
e.printStackTrace();
}
}
public void close() {
try {
urlsDB.close();
// System.out.println("Closing the Sleepy Cat DB");
} catch (DatabaseException e) {
e.printStackTrace();
}
}
}

View file

@ -1,34 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.parser;
import edu.uci.ics.crawler4j.parser.ParseData;
public class BinaryParseData implements ParseData {
private static BinaryParseData instance = new BinaryParseData();
public static BinaryParseData getInstance() {
return instance;
}
@Override
public String toString() {
return "[Binary parse data can not be dumped as string]";
}
}

View file

@ -1,166 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.parser;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class HtmlContentHandler extends DefaultHandler {
private enum Element {
A, AREA, LINK, IFRAME, FRAME, EMBED, IMG, BASE, META, BODY
}
private static class HtmlFactory {
private static Map<String, Element> name2Element;
static {
name2Element = new HashMap<String, Element>();
for (Element element : Element.values()) {
name2Element.put(element.toString().toLowerCase(), element);
}
}
public static Element getElement(String name) {
return name2Element.get(name);
}
}
private String base;
private String metaRefresh;
private String metaLocation;
private boolean isWithinBodyElement;
private StringBuilder bodyText;
private Set<String> outgoingUrls;
public HtmlContentHandler() {
isWithinBodyElement = false;
bodyText = new StringBuilder();
outgoingUrls = new HashSet<String>();
}
@Override
public void startElement(String uri, String localName, String qName,
Attributes attributes) throws SAXException {
Element element = HtmlFactory.getElement(localName);
if (element == Element.A || element == Element.AREA
|| element == Element.LINK) {
String href = attributes.getValue("href");
if (href != null) {
outgoingUrls.add(href);
}
return;
}
if (element == Element.IMG) {
String imgSrc = attributes.getValue("src");
if (imgSrc != null) {
outgoingUrls.add(imgSrc);
}
return;
}
if (element == Element.IFRAME || element == Element.FRAME
|| element == Element.EMBED) {
String src = attributes.getValue("src");
if (src != null) {
outgoingUrls.add(src);
}
return;
}
if (element == Element.BASE) {
if (base == null) { // We only consider the first occurrence of the
// Base element.
String href = attributes.getValue("href");
if (href != null) {
base = href;
}
}
return;
}
if (element == Element.META) {
String equiv = attributes.getValue("http-equiv");
String content = attributes.getValue("content");
if (equiv != null && content != null) {
equiv = equiv.toLowerCase();
// http-equiv="refresh" content="0;URL=http://foo.bar/..."
if (equiv.equals("refresh") && (metaRefresh == null)) {
int pos = content.toLowerCase().indexOf("url=");
if (pos != -1) {
metaRefresh = content.substring(pos + 4);
}
}
// http-equiv="location" content="http://foo.bar/..."
if (equiv.equals("location") && (metaLocation == null)) {
metaLocation = content;
}
}
return;
}
if (element == Element.BODY) {
isWithinBodyElement = true;
}
}
@Override
public void endElement(String uri, String localName, String qName)
throws SAXException {
Element element = HtmlFactory.getElement(localName);
if (element == Element.BODY) {
isWithinBodyElement = false;
}
}
@Override
public void characters(char ch[], int start, int length)
throws SAXException {
if (isWithinBodyElement) {
bodyText.append(ch, start, length);
}
}
public String getBodyText() {
return bodyText.toString();
}
public Set<String> getOutgoingUrls() {
return outgoingUrls;
}
public String getBaseUrl() {
return base;
}
}
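
A minimal sketch of driving this handler through Tika's HtmlParser, the same way the Parser class later in this change does; the HTML snippet is illustrative.

import java.io.ByteArrayInputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;

import edu.uci.ics.crawler4j.parser.HtmlContentHandler;

public class HtmlContentHandlerExample {
    public static void main(String[] args) throws Exception {
        String html = "<html><head><title>t</title></head>"
                + "<body>Hello <a href=\"/about.html\">about</a>"
                + " <img src=\"logo.png\"/></body></html>";

        HtmlContentHandler handler = new HtmlContentHandler();
        Metadata metadata = new Metadata();

        // Tika drives startElement/characters/endElement on the handler.
        new HtmlParser().parse(new ByteArrayInputStream(html.getBytes("UTF-8")),
                handler, metadata, new ParseContext());

        System.out.println("Body text: " + handler.getBodyText().trim());
        System.out.println("Outgoing:  " + handler.getOutgoingUrls());
    }
}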

View file

@ -1,71 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.parser;
import java.util.List;
import edu.uci.ics.crawler4j.url.WebURL;
public class HtmlParseData implements ParseData {
private String html;
private String text;
private String title;
private List<WebURL> outgoingUrls;
public String getHtml() {
return html;
}
public void setHtml(String html) {
this.html = html;
}
public String getText() {
return text;
}
public void setText(String text) {
this.text = text;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public List<WebURL> getOutgoingUrls() {
return outgoingUrls;
}
public void setOutgoingUrls(List<WebURL> outgoingUrls) {
this.outgoingUrls = outgoingUrls;
}
@Override
public String toString() {
return text;
}
}

View file

@ -1,25 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.parser;
public interface ParseData {
@Override
public String toString();
}

View file

@ -1,162 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.parser;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class Parser extends Configurable {
private HtmlParser htmlParser;
private ParseContext parseContext;
public Parser(CrawlConfig config) {
super(config);
htmlParser = new HtmlParser();
parseContext = new ParseContext();
}
public boolean parse(Page page, String contextURL) {
if (Util.hasBinaryContent(page.getContentType())) {
if (!config.isIncludeBinaryContentInCrawling()) {
return false;
} else {
page.setParseData(BinaryParseData.getInstance());
return true;
}
} else if (Util.hasPlainTextContent(page.getContentType())) {
try {
TextParseData parseData = new TextParseData();
parseData.setTextContent(new String(page.getContentData(), page
.getContentCharset()));
page.setParseData(parseData);
return true;
} catch (NullPointerException npe) {
// ignore, means it couldn't parse because it's not text
} catch (Exception e) {
e.printStackTrace();
}
return false;
}
Metadata metadata = new Metadata();
HtmlContentHandler contentHandler = new HtmlContentHandler();
InputStream inputStream = null;
try {
inputStream = new ByteArrayInputStream(page.getContentData());
htmlParser.parse(inputStream, contentHandler, metadata,
parseContext);
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
if (inputStream != null) {
inputStream.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
if (page.getContentCharset() == null) {
page.setContentCharset(metadata.get("Content-Encoding"));
}
HtmlParseData parseData = new HtmlParseData();
parseData.setText(contentHandler.getBodyText().trim());
parseData.setTitle(metadata.get(Metadata.TITLE));
Set<String> urls = new HashSet<String>();
String baseURL = contentHandler.getBaseUrl();
if (baseURL != null) {
contextURL = baseURL;
}
int urlCount = 0;
for (String href : contentHandler.getOutgoingUrls()) {
href = href.trim();
if (href.length() == 0) {
continue;
}
String hrefWithoutProtocol = href.toLowerCase();
if (href.startsWith("http://")) {
hrefWithoutProtocol = href.substring(7);
}
if (!hrefWithoutProtocol.contains("javascript:")
&& !hrefWithoutProtocol.contains("@")) {
String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
if (url != null) {
urls.add(url);
urlCount++;
if (urlCount > config.getMaxOutgoingLinksToFollow()) {
break;
}
}
}
}
List<WebURL> outgoingUrls = new ArrayList<WebURL>();
for (String url : urls) {
WebURL webURL = new WebURL();
webURL.setURL(url);
outgoingUrls.add(webURL);
}
parseData.setOutgoingUrls(outgoingUrls);
try {
if (page.getContentCharset() == null) {
parseData.setHtml(new String(page.getContentData()));
} else {
parseData.setHtml(new String(page.getContentData(), page
.getContentCharset()));
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
return false;
}
page.setParseData(parseData);
return true;
}
}

View file

@ -1,37 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.parser;
public class TextParseData implements ParseData {
private String textContent;
public String getTextContent() {
return textContent;
}
public void setTextContent(String textContent) {
this.textContent = textContent;
}
@Override
public String toString() {
return textContent;
}
}

View file

@ -1,62 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.robotstxt;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class HostDirectives {
// If the directives for this host were fetched more than
// 24 hours ago, they have to be re-fetched.
private static final long EXPIRATION_DELAY = 24 * 60 * 60 * 1000L;
private RuleSet disallows = new RuleSet();
private RuleSet allows = new RuleSet();
private long timeFetched;
private long timeLastAccessed;
public HostDirectives() {
timeFetched = System.currentTimeMillis();
}
public boolean needsRefetch() {
return (System.currentTimeMillis() - timeFetched > EXPIRATION_DELAY);
}
public boolean allows(String path) {
timeLastAccessed = System.currentTimeMillis();
return !disallows.containsPrefixOf(path)
|| allows.containsPrefixOf(path);
}
public void addDisallow(String path) {
disallows.add(path);
}
public void addAllow(String path) {
allows.add(path);
}
public long getLastAccessTime() {
return timeLastAccessed;
}
}
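
A short sketch of the allow/disallow semantics above: a path is blocked only if some disallow rule prefixes it and no allow rule does. The paths are arbitrary.

import edu.uci.ics.crawler4j.robotstxt.HostDirectives;

public class HostDirectivesExample {
    public static void main(String[] args) {
        HostDirectives directives = new HostDirectives();
        directives.addDisallow("/private/");
        directives.addAllow("/private/public-report/");

        System.out.println(directives.allows("/index.html"));              // true
        System.out.println(directives.allows("/private/notes.txt"));       // false
        System.out.println(directives.allows("/private/public-report/a")); // true, allow wins

        // Fresh directives are not stale until the expiration delay elapses.
        System.out.println("Needs refetch? " + directives.needsRefetch()); // false
    }
}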

View file

@ -1,63 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.robotstxt;
public class RobotstxtConfig {
/**
* Should the crawler obey Robots.txt protocol? More info on Robots.txt is
* available at http://www.robotstxt.org/
*/
private boolean enabled = true;
/**
* The user-agent name that is used to determine whether some servers
* define rules specific to this agent name.
*/
private String userAgentName = "crawler4j";
/**
* The maximum number of hosts for which their robots.txt is cached.
*/
private int cacheSize = 500;
public boolean isEnabled() {
return enabled;
}
public void setEnabled(boolean enabled) {
this.enabled = enabled;
}
public String getUserAgentName() {
return userAgentName;
}
public void setUserAgentName(String userAgentName) {
this.userAgentName = userAgentName;
}
public int getCacheSize() {
return cacheSize;
}
public void setCacheSize(int cacheSize) {
this.cacheSize = cacheSize;
}
}

View file

@ -1,101 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.robotstxt;
import java.util.StringTokenizer;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class RobotstxtParser {
private static final String PATTERNS_USERAGENT = "(?i)^User-agent:.*";
private static final String PATTERNS_DISALLOW = "(?i)Disallow:.*";
private static final String PATTERNS_ALLOW = "(?i)Allow:.*";
private static final int PATTERNS_USERAGENT_LENGTH = 11;
private static final int PATTERNS_DISALLOW_LENGTH = 9;
private static final int PATTERNS_ALLOW_LENGTH = 6;
public static HostDirectives parse(String content, String myUserAgent) {
HostDirectives directives = null;
boolean inMatchingUserAgent = false;
StringTokenizer st = new StringTokenizer(content, "\n");
while (st.hasMoreTokens()) {
String line = st.nextToken();
int commentIndex = line.indexOf("#");
if (commentIndex > -1) {
line = line.substring(0, commentIndex);
}
// remove any html markup
line = line.replaceAll("<[^>]+>", "");
line = line.trim();
if (line.length() == 0) {
continue;
}
if (line.matches(PATTERNS_USERAGENT)) {
String ua = line.substring(PATTERNS_USERAGENT_LENGTH).trim()
.toLowerCase();
if (ua.equals("*") || ua.contains(myUserAgent)) {
inMatchingUserAgent = true;
if (directives == null) {
directives = new HostDirectives();
}
} else {
inMatchingUserAgent = false;
}
} else if (line.matches(PATTERNS_DISALLOW)) {
if (!inMatchingUserAgent) {
continue;
}
String path = line.substring(PATTERNS_DISALLOW_LENGTH).trim();
if (path.endsWith("*")) {
path = path.substring(0, path.length() - 1);
}
path = path.trim();
if (path.length() > 0) {
directives.addDisallow(path);
}
} else if (line.matches(PATTERNS_ALLOW)) {
if (!inMatchingUserAgent) {
continue;
}
String path = line.substring(PATTERNS_ALLOW_LENGTH).trim();
if (path.endsWith("*")) {
path = path.substring(0, path.length() - 1);
}
path = path.trim();
directives.addAllow(path);
}
}
return directives;
}
}
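
A quick sketch of feeding a raw robots.txt body through the parser above; the rules are illustrative.

import edu.uci.ics.crawler4j.robotstxt.HostDirectives;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtParser;

public class RobotstxtParserExample {
    public static void main(String[] args) {
        String robotsTxt = "User-agent: *\n"
                + "Disallow: /cgi-bin/\n"
                + "Allow: /cgi-bin/status\n"
                + "# comments and unknown lines are ignored\n";

        // "*" applies to every agent, including the name passed here.
        HostDirectives directives = RobotstxtParser.parse(robotsTxt, "crawler4j");

        System.out.println(directives.allows("/index.html"));     // true
        System.out.println(directives.allows("/cgi-bin/secret")); // false
        System.out.println(directives.allows("/cgi-bin/status")); // true
    }
}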

View file

@ -1,132 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.robotstxt;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.http.HttpStatus;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class RobotstxtServer {
protected RobotstxtConfig config;
protected final Map<String, HostDirectives> host2directivesCache = new HashMap<String, HostDirectives>();
protected PageFetcher pageFetcher;
public RobotstxtServer(RobotstxtConfig config, PageFetcher pageFetcher) {
this.config = config;
this.pageFetcher = pageFetcher;
}
public boolean allows(WebURL webURL) {
if (!config.isEnabled()) {
return true;
}
try {
URL url = new URL(webURL.getURL());
String host = url.getHost().toLowerCase();
String path = url.getPath();
HostDirectives directives = host2directivesCache.get(host);
if (directives != null && directives.needsRefetch()) {
synchronized (host2directivesCache) {
host2directivesCache.remove(host);
directives = null;
}
}
if (directives == null) {
directives = fetchDirectives(host);
}
return directives.allows(path);
} catch (MalformedURLException e) {
e.printStackTrace();
}
return true;
}
private HostDirectives fetchDirectives(String host) {
WebURL robotsTxtUrl = new WebURL();
robotsTxtUrl.setURL("http://" + host + "/robots.txt");
HostDirectives directives = null;
PageFetchResult fetchResult = null;
try {
fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
Page page = new Page(robotsTxtUrl);
fetchResult.fetchContent(page);
if (Util.hasPlainTextContent(page.getContentType())) {
try {
String content;
if (page.getContentCharset() == null) {
content = new String(page.getContentData());
} else {
content = new String(page.getContentData(),
page.getContentCharset());
}
directives = RobotstxtParser.parse(content,
config.getUserAgentName());
} catch (Exception e) {
e.printStackTrace();
}
}
}
} finally {
fetchResult.discardContentIfNotConsumed();
}
if (directives == null) {
// We still need to have this object to keep track of the time we
// fetched it
directives = new HostDirectives();
}
synchronized (host2directivesCache) {
if (host2directivesCache.size() == config.getCacheSize()) {
String minHost = null;
long minAccessTime = Long.MAX_VALUE;
for (Entry<String, HostDirectives> entry : host2directivesCache
.entrySet()) {
if (entry.getValue().getLastAccessTime() < minAccessTime) {
minAccessTime = entry.getValue().getLastAccessTime();
minHost = entry.getKey();
}
}
host2directivesCache.remove(minHost);
}
host2directivesCache.put(host, directives);
}
return directives;
}
}

View file

@ -1,53 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.robotstxt;
import java.util.SortedSet;
import java.util.TreeSet;
public class RuleSet extends TreeSet<String> {
private static final long serialVersionUID = 1L;
@Override
public boolean add(String str) {
SortedSet<String> sub = headSet(str);
if (!sub.isEmpty() && str.startsWith(sub.last())) {
// no need to add; prefix is already present
return false;
}
boolean retVal = super.add(str);
sub = tailSet(str + "\0");
while (!sub.isEmpty() && sub.first().startsWith(str)) {
// remove redundant entries
sub.remove(sub.first());
}
return retVal;
}
public boolean containsPrefixOf(String s) {
SortedSet<String> sub = headSet(s);
// because redundant prefixes have been eliminated,
// only a test against last item in headSet is necessary
if (!sub.isEmpty() && s.startsWith(sub.last())) {
return true; // prefix substring exists
}
// might still exist exactly (headSet does not contain boundary)
return contains(s);
}
}
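
To illustrate the prefix-collapsing behaviour above, a small sketch with arbitrary paths.

import edu.uci.ics.crawler4j.robotstxt.RuleSet;

public class RuleSetExample {
    public static void main(String[] args) {
        RuleSet rules = new RuleSet();

        System.out.println(rules.add("/private/tmp/"));   // true  - first entry
        System.out.println(rules.add("/private/"));       // true  - supersedes the longer entry
        System.out.println(rules.add("/private/logs/"));  // false - already covered by "/private/"

        System.out.println(rules);                                 // [/private/]
        System.out.println(rules.containsPrefixOf("/private/x"));  // true
        System.out.println(rules.containsPrefixOf("/public/x"));   // false
    }
}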

View file

@ -1,49 +0,0 @@
package edu.uci.ics.crawler4j.url;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
public class TLDList {
private final static Set<String> tldSet = new HashSet<String>();
public static boolean contains(String str) {
if (tldSet != null) {
return tldSet.contains(str);
} else {
return false;
}
}
static {
BufferedReader reader = null;
try {
reader = new BufferedReader(new InputStreamReader(
TLDList.class.getClassLoader().getResourceAsStream(
"tld-names.txt")));
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (line.isEmpty() || line.startsWith("//")) {
continue;
}
tldSet.add(line);
}
} catch (IOException e) {
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
// Nothing we can do
}
}
}
}
}
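
A minimal sketch of the lookup above; it assumes a tld-names.txt resource is available on the classpath (otherwise the static initializer prints a stack trace and the set stays empty).

import edu.uci.ics.crawler4j.url.TLDList;

public class TLDListExample {
    public static void main(String[] args) {
        // Membership checks against the bundled top-level-domain list.
        System.out.println(TLDList.contains("com"));      // true if "com" is listed
        System.out.println(TLDList.contains("invalid"));  // false
    }
}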

View file

@ -1,217 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.url;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
/**
* See http://en.wikipedia.org/wiki/URL_normalization for a reference.
* Note: some parts of the code are adapted from http://stackoverflow.com/a/4057470/405418
*
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class URLCanonicalizer {
public static String getCanonicalURL(String url) {
return getCanonicalURL(url, null);
}
public static String getCanonicalURL(String href, String context) {
try {
URL canonicalURL = new URL(UrlResolver.resolveUrl(
context == null ? "" : context, href));
String path = canonicalURL.getPath();
/*
* Normalize: no empty segments (i.e., "//"), no segments equal to
* ".", and no segments equal to ".." that are preceded by a segment
* not equal to "..".
*/
path = new URI(path).normalize().toString();
/*
* Convert '//' -> '/'
*/
int idx = path.indexOf("//");
while (idx >= 0) {
path = path.replace("//", "/");
idx = path.indexOf("//");
}
/*
* Drop starting '/../'
*/
while (path.startsWith("/../")) {
path = path.substring(3);
}
/*
* Trim
*/
path = path.trim();
final SortedMap<String, String> params = createParameterMap(canonicalURL
.getQuery());
final String queryString;
if (params != null && params.size() > 0) {
String canonicalParams = canonicalize(params);
queryString = (canonicalParams.isEmpty() ? "" : "?"
+ canonicalParams);
} else {
queryString = "";
}
/*
* Add starting slash if needed
*/
if (path.length() == 0) {
path = "/" + path;
}
/*
* Drop default port: example.com:80 -> example.com
*/
int port = canonicalURL.getPort();
if (port == canonicalURL.getDefaultPort()) {
port = -1;
}
/*
* Lowercasing protocol and host
*/
String protocol = canonicalURL.getProtocol().toLowerCase();
String host = canonicalURL.getHost().toLowerCase();
String pathAndQueryString = normalizePath(path) + queryString;
URL result = new URL(protocol, host, port, pathAndQueryString);
return result.toExternalForm();
} catch (MalformedURLException ex) {
return null;
} catch (URISyntaxException ex) {
return null;
}
}
/**
* Takes a query string, separates the constituent name-value pairs, and
* stores them in a SortedMap ordered by lexicographical order.
*
* @return Null if there is no query string.
*/
private static SortedMap<String, String> createParameterMap(
final String queryString) {
if (queryString == null || queryString.isEmpty()) {
return null;
}
final String[] pairs = queryString.split("&");
final Map<String, String> params = new HashMap<String, String>(
pairs.length);
for (final String pair : pairs) {
if (pair.length() == 0) {
continue;
}
String[] tokens = pair.split("=", 2);
switch (tokens.length) {
case 1:
if (pair.charAt(0) == '=') {
params.put("", tokens[0]);
} else {
params.put(tokens[0], "");
}
break;
case 2:
params.put(tokens[0], tokens[1]);
break;
}
}
return new TreeMap<String, String>(params);
}
/**
* Canonicalize the query string.
*
* @param sortedParamMap
* Parameter name-value pairs in lexicographical order.
* @return Canonical form of query string.
*/
private static String canonicalize(
final SortedMap<String, String> sortedParamMap) {
if (sortedParamMap == null || sortedParamMap.isEmpty()) {
return "";
}
final StringBuffer sb = new StringBuffer(100);
for (Map.Entry<String, String> pair : sortedParamMap.entrySet()) {
final String key = pair.getKey().toLowerCase();
if (key.equals("jsessionid") || key.equals("phpsessid")
|| key.equals("aspsessionid")) {
continue;
}
if (sb.length() > 0) {
sb.append('&');
}
sb.append(percentEncodeRfc3986(pair.getKey()));
if (!pair.getValue().isEmpty()) {
sb.append('=');
sb.append(percentEncodeRfc3986(pair.getValue()));
}
}
return sb.toString();
}
/**
* Percent-encode values according to RFC 3986. The built-in Java
* URLEncoder does not encode according to the RFC, so we make the extra
* replacements.
*
* @param string
* Decoded string.
* @return Encoded string per RFC 3986.
*/
private static String percentEncodeRfc3986(String string) {
try {
string = string.replace("+", "%2B");
string = URLDecoder.decode(string, "UTF-8");
string = URLEncoder.encode(string, "UTF-8");
return string.replace("+", "%20").replace("*", "%2A")
.replace("%7E", "~");
} catch (Exception e) {
return string;
}
}
private static String normalizePath(final String path) {
return path.replace("%7E", "~").replace(" ", "%20");
}
}
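A hypothetical snippet (not part of this commit) illustrating the normalization performed by getCanonicalURL; the expected value in the comment follows from the implementation above:

import edu.uci.ics.crawler4j.url.URLCanonicalizer;

public class CanonicalizerDemo {
    public static void main(String[] args) {
        // Mixed-case scheme and host, default port, dot segments, unsorted query parameters.
        String url = "HTTP://www.Example.com:80/a/b/../c/./d.html?y=2&x=1";
        // Expected: http://www.example.com/a/c/d.html?x=1&y=2
        System.out.println(URLCanonicalizer.getCanonicalURL(url));
    }
}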

View file

@@ -1,484 +0,0 @@
/**
* This class is adapted from HtmlUnit with the following copyright:
*
* Copyright (c) 2002-2012 Gargoyle Software Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.url;
public final class UrlResolver {
/**
* Resolves a given relative URL against a base URL. See <a
* href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a> Section 4 for
* more details.
*
* @param baseUrl
* The base URL in which to resolve the specification.
* @param relativeUrl
* The relative URL to resolve against the base URL.
* @return the resolved specification.
*/
public static String resolveUrl(final String baseUrl,
final String relativeUrl) {
if (baseUrl == null) {
throw new IllegalArgumentException("Base URL must not be null");
}
if (relativeUrl == null) {
throw new IllegalArgumentException("Relative URL must not be null");
}
final Url url = resolveUrl(parseUrl(baseUrl.trim()), relativeUrl.trim());
return url.toString();
}
/**
* Returns the index within the specified string of the first occurrence of
* the specified search character.
*
* @param s
* the string to search
* @param searchChar
* the character to search for
* @param beginIndex
* the index at which to start the search
* @param endIndex
* the index at which to stop the search
* @return the index of the first occurrence of the character in the string
* or <tt>-1</tt>
*/
private static int indexOf(final String s, final char searchChar,
final int beginIndex, final int endIndex) {
for (int i = beginIndex; i < endIndex; i++) {
if (s.charAt(i) == searchChar) {
return i;
}
}
return -1;
}
/**
* Parses a given specification using the algorithm depicted in <a
* href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
*
* Section 2.4: Parsing a URL
*
* An accepted method for parsing URLs is useful to clarify the generic-RL
* syntax of Section 2.2 and to describe the algorithm for resolving
* relative URLs presented in Section 4. This section describes the parsing
* rules for breaking down a URL (relative or absolute) into the component
* parts described in Section 2.1. The rules assume that the URL has already
* been separated from any surrounding text and copied to a "parse string".
* The rules are listed in the order in which they would be applied by the
* parser.
*
* @param spec
* The specification to parse.
* @return the parsed specification.
*/
private static Url parseUrl(final String spec) {
final Url url = new Url();
int startIndex = 0;
int endIndex = spec.length();
// Section 2.4.1: Parsing the Fragment Identifier
//
// If the parse string contains a crosshatch "#" character, then the
// substring after the first (left-most) crosshatch "#" and up to the
// end of the parse string is the <fragment> identifier. If the
// crosshatch is the last character, or no crosshatch is present, then
// the fragment identifier is empty. The matched substring, including
// the crosshatch character, is removed from the parse string before
// continuing.
//
// Note that the fragment identifier is not considered part of the URL.
// However, since it is often attached to the URL, parsers must be able
// to recognize and set aside fragment identifiers as part of the
// process.
final int crosshatchIndex = indexOf(spec, '#', startIndex, endIndex);
if (crosshatchIndex >= 0) {
url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex);
endIndex = crosshatchIndex;
}
// Section 2.4.2: Parsing the Scheme
//
// If the parse string contains a colon ":" after the first character
// and before any characters not allowed as part of a scheme name (i.e.,
// any not an alphanumeric, plus "+", period ".", or hyphen "-"), the
// <scheme> of the URL is the substring of characters up to but not
// including the first colon. These characters and the colon are then
// removed from the parse string before continuing.
final int colonIndex = indexOf(spec, ':', startIndex, endIndex);
if (colonIndex > 0) {
final String scheme = spec.substring(startIndex, colonIndex);
if (isValidScheme(scheme)) {
url.scheme_ = scheme;
startIndex = colonIndex + 1;
}
}
// Section 2.4.3: Parsing the Network Location/Login
//
// If the parse string begins with a double-slash "//", then the
// substring of characters after the double-slash and up to, but not
// including, the next slash "/" character is the network location/login
// (<net_loc>) of the URL. If no trailing slash "/" is present, the
// entire remaining parse string is assigned to <net_loc>. The double-
// slash and <net_loc> are removed from the parse string before
// continuing.
//
// Note: We also accept a question mark "?" or a semicolon ";" character
// as
// delimiters for the network location/login (<net_loc>) of the URL.
final int locationStartIndex;
int locationEndIndex;
if (spec.startsWith("//", startIndex)) {
locationStartIndex = startIndex + 2;
locationEndIndex = indexOf(spec, '/', locationStartIndex, endIndex);
if (locationEndIndex >= 0) {
startIndex = locationEndIndex;
}
} else {
locationStartIndex = -1;
locationEndIndex = -1;
}
// Section 2.4.4: Parsing the Query Information
//
// If the parse string contains a question mark "?" character, then the
// substring after the first (left-most) question mark "?" and up to the
// end of the parse string is the <query> information. If the question
// mark is the last character, or no question mark is present, then the
// query information is empty. The matched substring, including the
// question mark character, is removed from the parse string before
// continuing.
final int questionMarkIndex = indexOf(spec, '?', startIndex, endIndex);
if (questionMarkIndex >= 0) {
if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
// The substring of characters after the double-slash and up to,
// but not
// including, the question mark "?" character is the network
// location/login
// (<net_loc>) of the URL.
locationEndIndex = questionMarkIndex;
startIndex = questionMarkIndex;
}
url.query_ = spec.substring(questionMarkIndex + 1, endIndex);
endIndex = questionMarkIndex;
}
// Section 2.4.5: Parsing the Parameters
//
// If the parse string contains a semicolon ";" character, then the
// substring after the first (left-most) semicolon ";" and up to the end
// of the parse string is the parameters (<params>). If the semicolon
// is the last character, or no semicolon is present, then <params> is
// empty. The matched substring, including the semicolon character, is
// removed from the parse string before continuing.
final int semicolonIndex = indexOf(spec, ';', startIndex, endIndex);
if (semicolonIndex >= 0) {
if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
// The substring of characters after the double-slash and up to,
// but not
// including, the semicolon ";" character is the network
// location/login
// (<net_loc>) of the URL.
locationEndIndex = semicolonIndex;
startIndex = semicolonIndex;
}
url.parameters_ = spec.substring(semicolonIndex + 1, endIndex);
endIndex = semicolonIndex;
}
// Section 2.4.6: Parsing the Path
//
// After the above steps, all that is left of the parse string is the
// URL <path> and the slash "/" that may precede it. Even though the
// initial slash is not part of the URL path, the parser must remember
// whether or not it was present so that later processes can
// differentiate between relative and absolute paths. Often this is
// done by simply storing the preceding slash along with the path.
if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
// The entire remaining parse string is assigned to the network
// location/login (<net_loc>) of the URL.
locationEndIndex = endIndex;
} else if (startIndex < endIndex) {
url.path_ = spec.substring(startIndex, endIndex);
}
// Set the network location/login (<net_loc>) of the URL.
if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) {
url.location_ = spec
.substring(locationStartIndex, locationEndIndex);
}
return url;
}
/*
* Returns true if specified string is a valid scheme name.
*/
private static boolean isValidScheme(final String scheme) {
final int length = scheme.length();
if (length < 1) {
return false;
}
char c = scheme.charAt(0);
if (!Character.isLetter(c)) {
return false;
}
for (int i = 1; i < length; i++) {
c = scheme.charAt(i);
if (!Character.isLetterOrDigit(c) && c != '.' && c != '+'
&& c != '-') {
return false;
}
}
return true;
}
/**
* Resolves a given relative URL against a base URL using the algorithm
* depicted in <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
*
* Section 4: Resolving Relative URLs
*
* This section describes an example algorithm for resolving URLs within a
* context in which the URLs may be relative, such that the result is always
* a URL in absolute form. Although this algorithm cannot guarantee that the
* resulting URL will equal that intended by the original author, it does
* guarantee that any valid URL (relative or absolute) can be consistently
* transformed to an absolute form given a valid base URL.
*
* @param baseUrl
* The base URL in which to resolve the specification.
* @param relativeUrl
* The relative URL to resolve against the base URL.
* @return the resolved specification.
*/
private static Url resolveUrl(final Url baseUrl, final String relativeUrl) {
final Url url = parseUrl(relativeUrl);
// Step 1: The base URL is established according to the rules of
// Section 3. If the base URL is the empty string (unknown),
// the embedded URL is interpreted as an absolute URL and
// we are done.
if (baseUrl == null) {
return url;
}
// Step 2: Both the base and embedded URLs are parsed into their
// component parts as described in Section 2.4.
// a) If the embedded URL is entirely empty, it inherits the
// entire base URL (i.e., is set equal to the base URL)
// and we are done.
if (relativeUrl.length() == 0) {
return new Url(baseUrl);
}
// b) If the embedded URL starts with a scheme name, it is
// interpreted as an absolute URL and we are done.
if (url.scheme_ != null) {
return url;
}
// c) Otherwise, the embedded URL inherits the scheme of
// the base URL.
url.scheme_ = baseUrl.scheme_;
// Step 3: If the embedded URL's <net_loc> is non-empty, we skip to
// Step 7. Otherwise, the embedded URL inherits the <net_loc>
// (if any) of the base URL.
if (url.location_ != null) {
return url;
}
url.location_ = baseUrl.location_;
// Step 4: If the embedded URL path is preceded by a slash "/", the
// path is not relative and we skip to Step 7.
if ((url.path_ != null)
&& ((url.path_.length() > 0) && ('/' == url.path_.charAt(0)))) {
url.path_ = removeLeadingSlashPoints(url.path_);
return url;
}
// Step 5: If the embedded URL path is empty (and not preceded by a
// slash), then the embedded URL inherits the base URL path,
// and
if (url.path_ == null) {
url.path_ = baseUrl.path_;
// a) if the embedded URL's <params> is non-empty, we skip to
// step 7; otherwise, it inherits the <params> of the base
// URL (if any) and
if (url.parameters_ != null) {
return url;
}
url.parameters_ = baseUrl.parameters_;
// b) if the embedded URL's <query> is non-empty, we skip to
// step 7; otherwise, it inherits the <query> of the base
// URL (if any) and we skip to step 7.
if (url.query_ != null) {
return url;
}
url.query_ = baseUrl.query_;
return url;
}
// Step 6: The last segment of the base URL's path (anything
// following the rightmost slash "/", or the entire path if no
// slash is present) is removed and the embedded URL's path is
// appended in its place. The following operations are
// then applied, in order, to the new path:
final String basePath = baseUrl.path_;
String path = "";
if (basePath != null) {
final int lastSlashIndex = basePath.lastIndexOf('/');
if (lastSlashIndex >= 0) {
path = basePath.substring(0, lastSlashIndex + 1);
}
} else {
path = "/";
}
path = path.concat(url.path_);
// a) All occurrences of "./", where "." is a complete path
// segment, are removed.
int pathSegmentIndex;
while ((pathSegmentIndex = path.indexOf("/./")) >= 0) {
path = path.substring(0, pathSegmentIndex + 1).concat(
path.substring(pathSegmentIndex + 3));
}
// b) If the path ends with "." as a complete path segment,
// that "." is removed.
if (path.endsWith("/.")) {
path = path.substring(0, path.length() - 1);
}
// c) All occurrences of "<segment>/../", where <segment> is a
// complete path segment not equal to "..", are removed.
// Removal of these path segments is performed iteratively,
// removing the leftmost matching pattern on each iteration,
// until no matching pattern remains.
while ((pathSegmentIndex = path.indexOf("/../")) > 0) {
final String pathSegment = path.substring(0, pathSegmentIndex);
final int slashIndex = pathSegment.lastIndexOf('/');
if (slashIndex < 0) {
continue;
}
if (!"..".equals(pathSegment.substring(slashIndex))) {
path = path.substring(0, slashIndex + 1).concat(
path.substring(pathSegmentIndex + 4));
}
}
// d) If the path ends with "<segment>/..", where <segment> is a
// complete path segment not equal to "..", that
// "<segment>/.." is removed.
if (path.endsWith("/..")) {
final String pathSegment = path.substring(0, path.length() - 3);
final int slashIndex = pathSegment.lastIndexOf('/');
if (slashIndex >= 0) {
path = path.substring(0, slashIndex + 1);
}
}
path = removeLeadingSlashPoints(path);
url.path_ = path;
// Step 7: The resulting URL components, including any inherited from
// the base URL, are recombined to give the absolute form of
// the embedded URL.
return url;
}
/**
* "/.." at the beginning should be removed as browsers do (not in RFC)
*/
private static String removeLeadingSlashPoints(String path) {
while (path.startsWith("/..")) {
path = path.substring(3);
}
return path;
}
/**
* Class <tt>Url</tt> represents a Uniform Resource Locator.
*
* @author Martin Tamme
*/
private static class Url {
private String scheme_;
private String location_;
private String path_;
private String parameters_;
private String query_;
private String fragment_;
/**
* Creates a <tt>Url</tt> object.
*/
public Url() {
}
/**
* Creates a <tt>Url</tt> object from the specified <tt>Url</tt> object.
*
* @param url
* a <tt>Url</tt> object.
*/
public Url(final Url url) {
scheme_ = url.scheme_;
location_ = url.location_;
path_ = url.path_;
parameters_ = url.parameters_;
query_ = url.query_;
fragment_ = url.fragment_;
}
/**
* Returns a string representation of the <tt>Url</tt> object.
*
* @return a string representation of the <tt>Url</tt> object.
*/
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
if (scheme_ != null) {
sb.append(scheme_);
sb.append(':');
}
if (location_ != null) {
sb.append("//");
sb.append(location_);
}
if (path_ != null) {
sb.append(path_);
}
if (parameters_ != null) {
sb.append(';');
sb.append(parameters_);
}
if (query_ != null) {
sb.append('?');
sb.append(query_);
}
if (fragment_ != null) {
sb.append('#');
sb.append(fragment_);
}
return sb.toString();
}
}
}
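A hypothetical snippet (not part of this commit) exercising resolveUrl with the standard RFC 1808/3986 reference examples; the expected values in the comments follow from the algorithm above:

import edu.uci.ics.crawler4j.url.UrlResolver;

public class UrlResolverDemo {
    public static void main(String[] args) {
        String base = "http://a/b/c/d;p?q";
        System.out.println(UrlResolver.resolveUrl(base, "g"));    // http://a/b/c/g
        System.out.println(UrlResolver.resolveUrl(base, "../g")); // http://a/b/g
        System.out.println(UrlResolver.resolveUrl(base, "?y"));   // http://a/b/c/d;p?y
        System.out.println(UrlResolver.resolveUrl(base, "#s"));   // http://a/b/c/d;p?q#s
    }
}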

View file

@@ -1,177 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.url;
import java.io.Serializable;
import com.sleepycat.persist.model.Entity;
import com.sleepycat.persist.model.PrimaryKey;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
@Entity
public class WebURL implements Serializable {
private static final long serialVersionUID = 1L;
@PrimaryKey
private String url;
private int docid;
private int parentDocid;
private String parentUrl;
private short depth;
private String domain;
private String subDomain;
private String path;
/**
* Returns the unique document id assigned to this Url.
*/
public int getDocid() {
return docid;
}
public void setDocid(int docid) {
this.docid = docid;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
WebURL otherUrl = (WebURL) o;
return url != null && url.equals(otherUrl.getURL());
}
@Override
public int hashCode() {
// Keep the equals/hashCode contract: equal URLs must hash equally.
return url == null ? 0 : url.hashCode();
}
@Override
public String toString() {
return url;
}
/**
* Returns the Url string
*/
public String getURL() {
return url;
}
public void setURL(String url) {
this.url = url;
int domainStartIdx = url.indexOf("//") + 2;
int domainEndIdx = url.indexOf('/', domainStartIdx);
if (domainEndIdx < 0) {
// URLs such as http://example.com have no path component after the host
domainEndIdx = url.length();
}
domain = url.substring(domainStartIdx, domainEndIdx);
subDomain = "";
String[] parts = domain.split("\\.");
if (parts.length > 2) {
domain = parts[parts.length - 2] + "." + parts[parts.length - 1];
int limit = 2;
if (TLDList.contains(domain)) {
domain = parts[parts.length - 3] + "." + domain;
limit = 3;
}
for (int i = 0; i < parts.length - limit; i++) {
if (subDomain.length() > 0) {
subDomain += ".";
}
subDomain += parts[i];
}
}
path = url.substring(domainEndIdx);
int pathEndIdx = path.indexOf('?');
if (pathEndIdx >= 0) {
path = path.substring(0, pathEndIdx);
}
}
/**
* Returns the unique document id of the parent page. The parent page is the
* page in which the Url of this page is first observed.
*/
public int getParentDocid() {
return parentDocid;
}
public void setParentDocid(int parentDocid) {
this.parentDocid = parentDocid;
}
/**
* Returns the url of the parent page. The parent page is the page in which
* the Url of this page is first observed.
*/
public String getParentUrl() {
return parentUrl;
}
public void setParentUrl(String parentUrl) {
this.parentUrl = parentUrl;
}
/**
* Returns the crawl depth at which this Url is first observed. Seed Urls
* are at depth 0. Urls that are extracted from seed Urls are at depth 1,
* etc.
*/
public short getDepth() {
return depth;
}
public void setDepth(short depth) {
this.depth = depth;
}
/**
* Returns the domain of this Url. For 'http://www.example.com/sample.htm',
* domain will be 'example.com'
*/
public String getDomain() {
return domain;
}
public String getSubDomain() {
return subDomain;
}
/**
* Returns the path of this Url. For 'http://www.example.com/sample.htm',
* the path will be '/sample.htm'
*/
public String getPath() {
return path;
}
public void setPath(String path) {
this.path = path;
}
}
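A hypothetical snippet (not part of this commit) showing how setURL splits a URL into domain, sub-domain and path; it assumes 'example.com' is not listed in tld-names.txt as a registered suffix:

import edu.uci.ics.crawler4j.url.WebURL;

public class WebURLDemo {
    public static void main(String[] args) {
        WebURL webUrl = new WebURL();
        webUrl.setURL("http://www.example.com/sample.htm?sessionid=xyz");
        System.out.println(webUrl.getDomain());    // example.com
        System.out.println(webUrl.getSubDomain()); // www
        System.out.println(webUrl.getPath());      // /sample.htm (query string stripped)
    }
}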

View file

@@ -1,66 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.util;
import java.io.File;
import java.io.FileOutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import org.apache.log4j.Logger;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class IO {
private static final Logger logger = Logger.getLogger(IO.class.getName());
public static boolean deleteFolder(File folder) {
return deleteFolderContents(folder) && folder.delete();
}
public static boolean deleteFolderContents(File folder) {
if (logger.isDebugEnabled()) {
logger.debug("Deleting content of: " + folder.getAbsolutePath());
}
File[] files = folder.listFiles();
if (files == null) {
// listFiles() returns null when the path is not a directory or an I/O error occurs
return false;
}
for (File file : files) {
if (file.isFile()) {
if (!file.delete()) {
return false;
}
} else {
if (!deleteFolder(file)) {
return false;
}
}
}
return true;
}
public static void writeBytesToFile(byte[] bytes, String destination) {
FileChannel fc = null;
try {
fc = new FileOutputStream(destination).getChannel();
fc.write(ByteBuffer.wrap(bytes));
} catch (Exception e) {
logger.error("Failed to write bytes to " + destination, e);
} finally {
// Close the channel even if the write fails, so the file handle is not leaked.
if (fc != null) {
try {
fc.close();
} catch (Exception e) {
// Nothing we can do
}
}
}
}
}

View file

@@ -1,82 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.util;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class Util {
public static byte[] long2ByteArray(long l) {
byte[] array = new byte[8];
int i, shift;
for (i = 0, shift = 56; i < 8; i++, shift -= 8) {
array[i] = (byte) (0xFF & (l >> shift));
}
return array;
}
public static byte[] int2ByteArray(int value) {
byte[] b = new byte[4];
for (int i = 0; i < 4; i++) {
int offset = (b.length - 1 - i) * 8;
b[i] = (byte) ((value >>> offset) & 0xFF);
}
return b;
}
public static int byteArray2Int(byte[] b) {
int value = 0;
for (int i = 0; i < 4; i++) {
int shift = (4 - 1 - i) * 8;
value += (b[i] & 0x000000FF) << shift;
}
return value;
}
public static long byteArray2Long(byte[] b) {
// Accumulate in a long: shifting an int by more than 31 bits would wrap
// and silently drop the upper four bytes.
long value = 0;
for (int i = 0; i < 8; i++) {
int shift = (8 - 1 - i) * 8;
value += ((long) (b[i] & 0xFF)) << shift;
}
return value;
}
public static boolean hasBinaryContent(String contentType) {
if (contentType != null) {
String typeStr = contentType.toLowerCase();
if (typeStr.contains("image") || typeStr.contains("audio")
|| typeStr.contains("video")
|| typeStr.contains("application")) {
return true;
}
}
return false;
}
public static boolean hasPlainTextContent(String contentType) {
if (contentType != null) {
String typeStr = contentType.toLowerCase();
if (typeStr.contains("text/plain")) {
return true;
}
}
return false;
}
}
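A hypothetical round-trip check (not part of this commit) for the byte-array conversions above; the long round trip assumes the long-accumulator fix applied to byteArray2Long:

import edu.uci.ics.crawler4j.util.Util;

public class UtilDemo {
    public static void main(String[] args) {
        int docid = 0x12345678;
        long key = 1401375650000L;
        // Both conversions are big-endian: most significant byte first.
        System.out.println(Util.byteArray2Int(Util.int2ByteArray(docid)) == docid); // true
        System.out.println(Util.byteArray2Long(Util.long2ByteArray(key)) == key);   // true
        System.out.println(Util.hasBinaryContent("image/png"));                     // true
        System.out.println(Util.hasPlainTextContent("text/plain; charset=UTF-8"));  // true
    }
}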

View file

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
<classpathentry exported="true" kind="lib" path="imageio-ext-tiff-1.0.8.jar"/>
<classpathentry exported="true" kind="lib" path="imageio-ext-utilities-1.0.8.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>

View file

@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>it.geosolutions</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.ManifestBuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.SchemaBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.pde.PluginNature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>

View file

@@ -1,7 +0,0 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6

View file

@@ -1,12 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: Geosolutions
Bundle-SymbolicName: it.geosolutions
Bundle-Version: 1.0.8
Bundle-ClassPath: imageio-ext-tiff-1.0.8.jar,
imageio-ext-utilities-1.0.8.jar
Export-Package: it.geosolutions.imageio.plugins.tiff,
it.geosolutions.imageio.utilities,
it.geosolutions.imageioimpl.plugins.tiff
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
Eclipse-BuddyPolicy: registered, ext, global

View file

@@ -1,3 +0,0 @@
bin.includes = META-INF/,\
imageio-ext-tiff-1.0.8.jar,\
imageio-ext-utilities-1.0.8.jar

View file

@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
<classpathentry exported="true" kind="lib" path="activation.jar" sourcepath="javax.activationsrc.zip"/>
<classpathentry kind="output" path="bin"/>
</classpath>

View file

@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>javax.activation</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.ManifestBuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.SchemaBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.pde.PluginNature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>

View file

@@ -1,7 +0,0 @@
#Thu Mar 26 11:17:44 CDT 2009
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6

View file

@@ -1,10 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: Activation Plug-in
Bundle-SymbolicName: javax.activation
Bundle-Version: 1.0.0.qualifier
Bundle-ClassPath: activation.jar
Export-Package: com.sun.activation.registries,
com.sun.activation.viewers,
javax.activation
Bundle-RequiredExecutionEnvironment: JavaSE-1.6

Binary file not shown.

View file

@@ -1,2 +0,0 @@
bin.includes = META-INF/,\
activation.jar

View file

@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
<classpathentry exported="true" kind="lib" path="mail.jar" sourcepath="javax.mailsrc.zip"/>
<classpathentry kind="output" path="bin"/>
</classpath>

View file

@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>javax.mail</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.ManifestBuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.SchemaBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.pde.PluginNature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>

View file

@@ -1,7 +0,0 @@
#Thu Mar 26 11:18:00 CDT 2009
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
org.eclipse.jdt.core.compiler.compliance=1.6
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.6

View file

@@ -1,19 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: Mail Plug-in
Bundle-SymbolicName: javax.mail
Bundle-Version: 1.0.0.qualifier
Bundle-ClassPath: mail.jar,
.
Export-Package: com.sun.mail.handlers,
com.sun.mail.iap,
com.sun.mail.imap,
com.sun.mail.imap.protocol,
com.sun.mail.pop3,
com.sun.mail.smtp,
com.sun.mail.util,
javax.mail,
javax.mail.event,
javax.mail.internet,
javax.mail.search
Bundle-RequiredExecutionEnvironment: JavaSE-1.6

View file

@@ -1,3 +0,0 @@
bin.includes = META-INF/,\
.,\
mail.jar

Binary file not shown.

View file

@@ -1,22 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>javax.media.opengl.win64</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.pde.ManifestBuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.pde.SchemaBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.pde.PluginNature</nature>
</natures>
</projectDescription>

View file

@@ -1,8 +0,0 @@
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: JOGL Win64 Specific Fragment
Bundle-SymbolicName: javax.media.opengl.win64
Bundle-Version: 1.14.0.qualifier
Fragment-Host: javax.media.opengl;bundle-version="1.1.1"
Bundle-RequiredExecutionEnvironment: JavaSE-1.6
Eclipse-PlatformFilter: (& (osgi.os=win32) (osgi.arch=x86_64))

View file

@@ -1,5 +0,0 @@
bin.includes = META-INF/,\
gluegen-rt.dll,\
jogl.dll,\
jogl_awt.dll,\
jogl_cg.dll

View file

@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
<classpathentry kind="output" path="bin"/>
</classpath>

Some files were not shown because too many files have changed in this diff.