Psst.. new poll here.
Psst.. new forums here.
Microsoft is blocking us again (TY IP Reputation!), so just use OAuth login instead. :)
Paste
Pasted as Python by Belxjander ( 15 years ago )
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from Arachnid.items import ArachnidItem
class AnimeWallpaperSpider(CrawlSpider):
    """Crawl animewallpapers.com and emit one ArachnidItem per href found.

    The spider starts from the three section index pages listed in
    ``start_urls`` and stays within ``allowed_domains``.
    """

    name = "www.animewallpapers.com"
    allowed_domains = [
        "www.animewallpapers.com",
        "media.animewallpapers.com",
    ]
    start_urls = [
        "http://www.animewallpapers.com/wallpapers/",
        "http://www.animewallpapers.com/ecchi/",
        "http://www.animewallpapers.com/game/",
    ]

    # A single rule that both follows every extracted link and hands each
    # fetched page to ``parse_item``.
    #
    # Notes on why this differs from the obvious spelling:
    # * The callback must NOT be named ``parse``: CrawlSpider implements its
    #   rule processing in ``parse``, so overriding it disables the rules.
    # * ``deny=''`` is deliberately omitted: an empty pattern matches every
    #   URL, which would reject all links.
    # * ``follow=True`` is explicit because a rule with a callback does not
    #   follow links by default.
    rules = (
        Rule(SgmlLinkExtractor(allow=''), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield an ArachnidItem for every ``href`` attribute in *response*.

        Each item's ``link`` field holds the value of exactly one ``href``
        attribute, as extracted from the page (not resolved against the
        page URL).
        """
        hxs = HtmlXPathSelector(response)
        for href in hxs.select('//@href'):
            item = ArachnidItem()
            # Extract this node only; an absolute '//@href' query here would
            # re-select every href in the document for each iteration.
            item['link'] = href.extract()
            yield item
Revise this Paste