1 23456
|
# Python prototype for LinkSearch - http://linksearch.sourceforge.net
#
# "main" crawler/indexer program
#
# Copyright (C) 2000 Andreas Harth (aharth@users.sourceforge.net)
Language: Python License: GPL (C) 2000 Andreas Harth (aharth@users.sourceforge.net) LOC: 109 SourceForge : LinkSearch: Google-like Search Engine (project search) : .../l/linksearch/linksearch/src/server.py
|
#
# crawler.py
#
# Python prototype for LinkSearch - http://linksearch.sourceforge.net
Language: Python License: GPL (C) 2000 Andreas Harth (aharth@users.sourceforge.net) LOC: 24 SourceForge : LinkSearch: Google-like Search Engine (project search) : .../l/linksearch/linksearch/src/globals.py
|
from anole.core.crawlhandler import CrawlHandler
from anole.core.fetcher import Fetcher
from anole.core.crawler import Crawler
from anole.core.processormgr import ProcessorMgr
import threading
Language: Python LOC: 66 Google : anole-spider - a python spider (project search) : .../anole-spider/trunk/anole/core/factory.py
|
# -- coding: latin-1
""" crawler.py - Module which does crawling and downloading
of urls from the web. This module is part of HarvestMan program.
Rewrite this to use twisted.
Language: Python LOC: 4 BerliOS : HarvestMan (project search) : .../harvestman/harvestman/HarvestMan-twisted/HarvestMan/crawler.py
|
# -- coding: latin-1
""" urlqueue.py - Module which controls queueing of urls
created by crawler threads. This is part of the HarvestMan
program.
Rewrite this to use twisted.
Language: Python LOC: 5 BerliOS : HarvestMan (project search) : .../harvestman/harvestman/HarvestMan-twisted/HarvestMan/urlqueue.py
|
"""
Noti - crawler for http://www.montevideo.com.uy
"""
from datetime import datetime
Language: Python LOC: 27 Google : noti - Noti es un framework de publicación... c... (project search) : .../notiuy/apps/noti/crawlers/montevideo.py
|
# Create a Document instance representing start url
doc= ruya.Document(ruya.Uri(url))
# Create a new crawler configuration object
cfg= ruya.Config(ruya.Config.CrawlConfig(levels= 1, crawldelay= 5), ruya.Config.RedirectConfig(), ruya.Config.LogConfig())
# Use a single-domain breadth crawler with crawler configuration
Language: Python LOC: 34 Spider_20090529_inc : Ruya (project search) : .../0015/ruya/ruya-1.0.zip/ruya-1.0/example.py
|
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin
class Crawler(object):
user_agent = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.1.7) Gecko/2007091417 Firefox/2.0.0.7"
crawled_urls = {}
Language: Python LOC: 33 Google : wiki-crawler - This is a web crawler speci...'s (project search) : .../Google/w/wiki-crawler/trunk/crawler.py
|
from BitQueue.optparse import OptionParser
from BitQueue import version
parser = OptionParser(usage='btqueue.py [-r|--root_path path] scheduler|crawler|remote|query|add [args...]',
version=version)
parser.add_option("-r","--root",dest="root_path",default=None,
Language: Python LOC: 101 Spider_20090227_inc : BitTorrent Queue Manager (project search) : .../009/btqueue/BTQueue-0.1.3-489.2.4.src.rpm/BTQueue-0.1.3/btqueue.py
|
from BitQueue.optparse import OptionParser
from BitQueue import version
parser = OptionParser(usage='btqueue.py [-r|--root_path path] scheduler|crawler|remote|query|add [args...]',
version=version)
parser.add_option("-r","--root",dest="root_path",default=None,
Language: Python LOC: 101 Spider_20090227_inc : BitTorrent Queue Manager (project search) : .../btqueue/BTQueue-0.1.3-489.2.4.noarch.rpm/usr/bin/btqueue.py
|
from BitQueue.optparse import OptionParser
from BitQueue import version
parser = OptionParser(usage='btqueue.py [-r|--root_path path] scheduler|crawler|remote|query|add [args...]',
version=version)
parser.add_option("-r","--root",dest="root_path",default=None,
Language: Python LOC: 101 Spider_20090227_inc : BitTorrent Queue Manager (project search) : .../009/btqueue/BTQueue-0.1.3/btqueue.py
|
from BitQueue.optparse import OptionParser
from BitQueue import version
parser = OptionParser(usage='btqueue.py [-r|--root_path path] scheduler|crawler|remote|query|add [args...]',
version=version)
parser.add_option("-r","--root",dest="root_path",default=None,
Language: Python LOC: 101 Spider_20090227_inc : BitTorrent Queue Manager (project search) : .../009/btqueue/BTQueue-0.1.3-489.2.3.src.rpm/BTQueue-0.1.3/btqueue.py
|
from BitQueue.optparse import OptionParser
from BitQueue import version
parser = OptionParser(usage='btqueue.py [-r|--root_path path] scheduler|crawler|remote|query|add [args...]',
version=version)
parser.add_option("-r","--root",dest="root_path",default=None,
Language: Python LOC: 101 Spider_20090227_inc : BitTorrent Queue Manager (project search) : .../009/btqueue/BTQueue-0.1.3-489.2.2.src.rpm/BTQueue-0.1.3/btqueue.py
|
from BitQueue.optparse import OptionParser
from BitQueue import version
parser = OptionParser(usage='btqueue.py [-r|--root_path path] scheduler|crawler|remote|query|add [args...]',
version=version)
parser.add_option("-r","--root",dest="root_path",default=None,
Language: Python LOC: 101 Spider_20090227_inc : BitTorrent Queue Manager (project search) : .../btqueue/BTQueue-0.1.3-489.2.3.noarch.rpm/usr/bin/btqueue.py
|
from BitQueue.optparse import OptionParser
from BitQueue import version
parser = OptionParser(usage='btqueue.py [-r|--root_path path] scheduler|crawler|remote|query|add [args...]',
version=version)
parser.add_option("-r","--root",dest="root_path",default=None,
Language: Python LOC: 101 Spider_20090227_inc : BitTorrent Queue Manager (project search) : .../btqueue/BTQueue-0.1.3-489.2.2.noarch.rpm/usr/bin/btqueue.py
|
#spider
print 'spider'
spider.crawler(100)
print 'about to import indexer'
import indexer
Language: Python LOC: 13 Spider_20081217_inc : Tyriel (project search) : .../inc_003/tyriel/tyriel-0.1.0.tgz/tyriel-0.1.0/initiate.py
|
"""
Noti: crawler for http://www.ultimasnoticias.com.uy
"""
from urllib import urlopen
Language: Python LOC: 41 Google : noti - Noti es un framework de publicación... c... (project search) : .../notiuy/apps/noti/crawlers/ultimasnoticias.py
|
"""
Noti: Crawler for www.elpais.com.uy
"""
from datetime import date
Language: Python LOC: 31 Google : noti - Noti es un framework de publicación... c... (project search) : .../notiuy/apps/noti/crawlers/elpais.py
|
"""
Noti - Crawler for http://observa.com.uy
"""
from datetime import datetime
Language: Python LOC: 35 Google : noti - Noti es un framework de publicación... c... (project search) : .../notiuy/apps/noti/crawlers/observa.py
|
"""
Noti - Crawler for http://www.larepublica.com.uy
"""
import re
Language: Python LOC: 41 Google : noti - Noti es un framework de publicación... c... (project search) : .../notiuy/apps/noti/crawlers/larepublica.py
|
# IN
# MM Jeweled Torque
# PR Silt Crawler
# Mirage Serpent
#
Language: Python LOC: 9 SourceForge : NetMage (project search) : .../netmage/effect/tests/games/lessTrivialGame.py
|
# IN
# MM Jeweled Torque
# PR Silt Crawler
# Mirage Serpent
#
Language: Python LOC: 9 SourceForge : NetMage (project search) : .../netmage/io/tests/games/lessTrivialGame.py
|
from anole.core.crawlapp import CrawlApp as App
from anole.core.fetcher import HttpFetcher as Fetcher
from anole.core.crawler import Crawler
from anole.core.processormgr import ProcessorMgr
from anole.core.dns import SimpleDns as Dns
Language: Python LOC: 77 Google : anole-spider - a python spider (project search) : .../Google/a/anole-spider/trunk/anole-manage.py
|
#-*- coding: UTF-8 -*-
'''
U{Ruya<http://ruya.sourceforge.net/>} I{Arabic name meaning "sight, vision"} is a Python-based crawler for crawling English...
B{It is targeted solely towards developers who want crawling functionality in their code}.
Some important features of this tool are-
Language: Python LOC: 1171 Spider_20090529_inc : Ruya (project search) : .../0015/ruya/ruya-1.0.zip/ruya-1.0/ruya.py
|
# -- coding: latin-1
""" crawler.py - Module which does crawling and downloading
of urls from the web. This module is part of HarvestMan program.
Author: Anand B Pillai (abpillai at gmail dot com).
Language: Python LOC: 495 BerliOS : HarvestMan (project search) : .../harvestman/harvestman/HarvestMan/HarvestMan/crawler.py
|
1 23456
|