StormCrawler DISCOVERs and FETCHes a website but nothing gets saved in docs

Question


There is a website that I'm trying to crawl. The crawler DISCOVERs and FETCHes the URLs, but nothing gets saved in the docs. The website is https://cactussara.ir. Where is the problem? This is the robots.txt of the website:

User-agent: *
Disallow: /
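
Note that with "Disallow: /" for all user agents, a polite crawler would fetch nothing from this site at all; fetching only happens here because http.skip.robots is set to true in the configuration further down. As a rough illustration of what that rule means (plain Java, not the crawler-commons parser that StormCrawler actually relies on):

import java.util.List;

// Illustrative only: a naive reading of the robots.txt above,
// not StormCrawler's actual robots handling (which uses crawler-commons).
public class NaiveRobotsCheck {

    // A path is disallowed if it starts with any non-empty Disallow prefix.
    static boolean isDisallowed(List<String> disallowRules, String path) {
        for (String rule : disallowRules) {
            if (!rule.isEmpty() && path.startsWith(rule)) {
                return true;
            }
        }
        return false;
    }

    public static void main(String[] args) {
        // "User-agent: *" followed by "Disallow: /" is a single prefix rule "/".
        List<String> disallowForAllAgents = List.of("/");

        // Every path starts with "/", so every URL on the site is blocked.
        System.out.println(isDisallowed(disallowForAllAgents, "/"));        // true
        System.out.println(isDisallowed(disallowForAllAgents, "/contact")); // true
    }
}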

And this is my urlfilters.json:

{
    "com.digitalpebble.stormcrawler.filtering.URLFilters": [
        {
            "class": "com.digitalpebble.stormcrawler.filtering.basic.BasicURLFilter",
            "name": "BasicURLFilter",
            "params": {
                "maxPathRepetition": 8,
                "maxLength": 8192
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.depth.MaxDepthFilter",
            "name": "MaxDepthFilter",
            "params": {
                "maxDepth": -1
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.basic.BasicURLNormalizer",
            "name": "BasicURLNormalizer",
            "params": {
                "removeAnchorPart": true,
                "unmangleQueryString": true,
                "checkValidURI": true,
                "removeHashes": false
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.host.HostURLFilter",
            "name": "HostURLFilter",
            "params": {
                "ignoreOutsideHost": true,
                "ignoreOutsideDomain": false
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLNormalizer",
            "name": "RegexURLNormalizer",
            "params": {
                "regexNormalizerFile": "default-regex-normalizers.xml"
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLFilter",
            "name": "RegexURLFilter",
            "params": {
                "regexFilterFile": "default-regex-filters.txt"
            }
        }
    ]
}
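
For context, the HostURLFilter entry with ignoreOutsideHost set to true keeps the crawl on cactussara.ir by dropping outlinks whose host differs from the page they were found on. A minimal sketch of that idea (using only java.net.URL, not StormCrawler's own filter classes):

import java.net.URL;

// Sketch of what HostURLFilter with ignoreOutsideHost=true amounts to:
// keep an outlink only if its host matches the host of the source page.
// Illustrative only, not StormCrawler's implementation.
public class HostFilterSketch {

    // Returns the outlink if it stays on the same host, or null if it is filtered out.
    static String filter(URL sourceUrl, String outlink) throws Exception {
        URL target = new URL(outlink);
        boolean sameHost = sourceUrl.getHost().equalsIgnoreCase(target.getHost());
        return sameHost ? outlink : null;
    }

    public static void main(String[] args) throws Exception {
        URL source = new URL("https://cactussara.ir/");
        System.out.println(filter(source, "https://cactussara.ir/about")); // kept
        System.out.println(filter(source, "https://example.com/page"));    // null (dropped)
    }
}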

And this is crawler-conf.yaml:

# Default configuration for StormCrawler
# This is used to make the default values explicit and list the most common configurations.
# Do not modify this file but instead provide a custom one with the parameter -conf
# when launching your extension of ConfigurableTopology.  

config: 
  fetcher.server.delay: 1.0
  # min. delay for multi-threaded queues
  fetcher.server.min.delay: 0.0
  fetcher.queue.mode: "byHost"
  fetcher.threads.per.queue: 1
  fetcher.threads.number: 10
  fetcher.max.urls.in.queues: -1
  fetcher.max.queue.size: -1
  # max. crawl-delay accepted in robots.txt (in seconds)
  fetcher.max.crawl.delay: 30
  # behavior of fetcher when the crawl-delay in the robots.txt
  # is larger than fetcher.max.crawl.delay:
  #  (if false)
  #    skip URLs from this queue to avoid that any overlong
  #    crawl-delay throttles the crawler
  #  (if true)
  #    set the delay to fetcher.max.crawl.delay,
  #    making fetcher more aggressive than requested
  fetcher.max.crawl.delay.force: false
  # behavior of fetcher when the crawl-delay in the robots.txt
  # is smaller (ev. less than one second) than the default delay:
  #  (if true)
  #    use the larger default delay (fetcher.server.delay)
  #    and ignore the shorter crawl-delay in the robots.txt
  #  (if false)
  #    use the delay specified in the robots.txt
  fetcher.server.delay.force: false

  # time bucket to use for the metrics sent by the Fetcher
  fetcher.metrics.time.bucket.secs: 10

  # SimpleFetcherBolt: if the delay required by the politeness
  # is above this value, the tuple is sent back to the Storm queue 
  # for the bolt on the _throttle_ stream.
  fetcher.max.throttle.sleep: -1

  # alternative values are "byIP" and "byDomain"
  partition.url.mode: "byHost"

  # metadata to transfer to the outlinks
  # used by Fetcher for redirections, sitemapparser, etc...
  # these are also persisted for the parent document (see below)
  # metadata.transfer:
  # - customMetadataName

  # lists the metadata to persist to storage
  # these are not transferred to the outlinks
  metadata.persist:
   - _redirTo
   - error.cause
   - error.source
   - isSitemap
   - isFeed

  metadata.track.path: true
  metadata.track.depth: true

  http.agent.name: "Anonymous Coward"
  http.agent.version: "1.0"
  http.agent.description: "built with StormCrawler ${version}"
  http.agent.url: "http://someorganization.com/"
  http.agent.email: "someone@someorganization.com"

  http.accept.language: "fa-IR,fa_IR,en-us,en-gb,en;q=0.7,*;q=0.3"
  http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  http.content.limit: -1
  http.store.headers: false
  http.timeout: 10000
  http.skip.robots: true

  # store partial fetches as trimmed content (some content has been fetched,
  # but reading more data from socket failed, eg. because of a network timeout)
  http.content.partial.as.trimmed: false

  # for crawling through a proxy:
  # http.proxy.host:
  # http.proxy.port:
  # okhttp only, defaults to "HTTP"
  # http.proxy.type: "SOCKS"
  # for crawling through a proxy with Basic authentication:
  # http.proxy.user:
  # http.proxy.pass:

  http.robots.403.allow: true

  # should the URLs be removed when a page is marked as noFollow
  robots.noFollow.strict: false

  # Guava caches used for the robots.txt directives 
  robots.cache.spec: "maximumSize=10000,expireAfterWrite=6h"
  robots.error.cache.spec: "maximumSize=10000,expireAfterWrite=1h"

  protocols: "http,https,file"
  http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
  https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
  file.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.file.FileProtocol"

  # navigationfilters.config.file: "navigationfilters.json"
  # selenium.addresses: "http://localhost:9515"
  selenium.implicitlyWait: 0
  selenium.pageLoadTimeout: -1
  selenium.setScriptTimeout: 0
  selenium.instances.num: 1
  selenium.capabilities:
    takesScreenshot: false
    loadImages: false
    javascriptEnabled: true
    # illustrates the use of the variable for user agent
    # phantomjs.page.settings.userAgent: "$userAgent"
    # ChromeDriver config
    # goog:chromeOptions:
    #   args: 
    #      - "--headless"
    #      - "--disable-gpu"
    #      - "--mute-audio"

  # DelegatorRemoteDriverProtocol
  selenium.delegated.protocol: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"

  # no url or parsefilters by default
  parsefilters.config.file: "parsefilters.json"
  urlfilters.config.file: "urlfilters.json"

  # JSoupParserBolt
  jsoup.treat.non.html.as.error: false
  parser.emitOutlinks: true
  parser.emitOutlinks.max.per.page: -1
  track.anchors: true
  detect.mimetype: true
  detect.charset.maxlength: 10000

  # filters URLs in sitemaps based on their modified Date (if any)
  sitemap.filter.hours.since.modified: -1

  # staggered scheduling of sitemaps
  sitemap.schedule.delay: -1

  # whether to add any sitemaps found in the robots.txt to the status stream
  # used by fetcher bolts
  sitemap.discovery: false

  # Default implementation of Scheduler
  scheduler.class: "com.digitalpebble.stormcrawler.persistence.DefaultScheduler"

  # revisit a page daily (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.default: 1440

  # revisit a page with a fetch error after 2 hours (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.fetch.error: 120

  # never revisit a page with an error (or set a value in minutes)
  fetchInterval.error: -1

  # custom fetch interval to be used when a document has the key/value in its metadata
  # and has been fetched successfully (value in minutes)
  # fetchInterval.FETCH_ERROR.isFeed=true
  # fetchInterval.isFeed=true: 10

  # max number of successive fetch errors before changing status to ERROR
  max.fetch.errors: 3

  # Guava cache use by AbstractStatusUpdaterBolt for DISCOVERED URLs
  status.updater.use.cache: true
  status.updater.cache.spec: "maximumSize=10000,expireAfterAccess=1h"

  # Can also take "MINUTE" or "HOUR"
  status.updater.unit.round.date: "SECOND"

  # configuration for the classes extending AbstractIndexerBolt
  # indexer.md.filter: "someKey=aValue"
  indexer.url.fieldname: "url"
  indexer.text.fieldname: "content"
  indexer.text.maxlength: -1
  indexer.canonical.name: "canonical"
  indexer.md.mapping:
  - parse.title=title
  - parse.keywords=keywords
  - parse.description=description
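
For reference, the indexer.md.mapping entries above mean that a metadata key produced at parse time (for example parse.title) is copied into the indexed document under the field name on the right of the equals sign (for example title). A toy sketch of that mapping step, not the actual AbstractIndexerBolt code:

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Toy illustration of the indexer.md.mapping entries:
// each "metadataKey=fieldName" pair copies a parse metadata value
// into the document that would be sent to the index. Not StormCrawler code.
public class MdMappingSketch {

    static Map<String, String> applyMapping(List<String> mapping, Map<String, String> parseMetadata) {
        Map<String, String> doc = new LinkedHashMap<>();
        for (String entry : mapping) {
            String[] kv = entry.split("=", 2); // e.g. "parse.title=title"
            String value = parseMetadata.get(kv[0]);
            if (value != null) {
                doc.put(kv[1], value);
            }
        }
        return doc;
    }

    public static void main(String[] args) {
        List<String> mapping = List.of("parse.title=title", "parse.keywords=keywords", "parse.description=description");
        // Hypothetical parse output for a fetched page.
        Map<String, String> md = Map.of("parse.title", "Example page title");
        System.out.println(applyMapping(mapping, md)); // {title=Example page title}
    }
}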

Thanks in advance.


Answer 1:


The pages contain

<meta name="robots" content="noindex,follow"/>

which is found by the parser and causes the indexer bolt to skip the page.

This can be confirmed in the metrics, where the Filtered count should equal the number of pages fetched.

http.skip.robots does not apply to the directives set in the page itself.
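
A quick way to double-check that the directive is really there is to look at the page's head with JSoup (the same library JSoupParserBolt is built on); this standalone snippet is only an illustration, not part of StormCrawler:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

// Standalone check for the robots meta directive that makes the indexer skip the page.
// Uses JSoup directly; illustrative only, not StormCrawler code.
public class RobotsMetaCheck {
    public static void main(String[] args) throws Exception {
        Document doc = Jsoup.connect("https://cactussara.ir/").get();
        // Prints e.g. "noindex,follow" when the directive is present.
        String robots = doc.select("meta[name=robots]").attr("content");
        System.out.println("robots meta: " + robots);
    }
}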



Source: https://stackoverflow.com/questions/57955781/stormcrawler-discover-and-fetch-a-website-but-nothing-gets-saved-in-docs
