I'd been trying to scrape some data from an ASP.NET website; the start page should be the following one: http://www.e3050.com/Items.aspx?cat=SON
First, I want to displa
Check this out; here is a complete solution:
- The parse method selects 50 products per page.
- The page_rs_50 method handles the pagination.
# Single entry URL for the crawl (the Items.aspx listing with cat=SON).
start_urls = ['http://www.e3050.com/Items.aspx?cat=SON']
# Accumulates the product hrefs collected by page_rs_50 across all listing
# pages; they are turned into product requests once the last page is reached.
pro_urls = [] # all product Urls
def parse(self, response):  # select 50 products on each page
    """Re-submit the listing form asking for 50 items per page.

    Posts the form found in ``response`` with the ``pagesddl`` field set
    to ``'50'`` and the ``sortddl`` field set to ``'Price(ASC)'``
    (presumably the page-size and sort drop-downs), without simulating a
    button click. The resulting page is handed to ``page_rs_50`` together
    with the initial paging state in ``meta``.
    """
    listing_form = {
        'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$pagesddl': '50',
        'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$sortddl': 'Price(ASC)',
    }
    # curr: page being requested; total: not known yet; flag: tells the
    # callback to read the total page count from the first results page.
    paging_state = {'curr': 1, 'total': 0, 'flag': True}

    first_page_request = FormRequest.from_response(
        response,
        formdata=listing_form,
        meta=paging_state,
        dont_click=True,
        callback=self.page_rs_50,
    )
    yield first_page_request
def page_rs_50(self, response):  # paginate the pages
    """Collect product links from one listing page and walk to the next.

    Reads the paging state (``curr``, ``total``, ``flag``) from the meta of
    the request that produced ``response``, appends the product hrefs found
    on this page to ``self.pro_urls``, and then either

    * yields a form post-back for the next listing page (while
      ``curr < total``), or
    * on the last page, yields one ``Request`` per collected product URL
      with ``parse_product`` as the callback.

    When ``flag`` is true (first results page) the total page count is
    read from the page itself.
    """
    hxs = HtmlXPathSelector(response)
    meta = response.request.meta
    curr = int(meta['curr'])
    total = int(meta['total'])
    flag = meta['flag']

    self.pro_urls.extend(hxs.select(
        "//td[@class='name']//a[contains(@id,'ctl00_ctl00_ContentPlaceHolder1_ItemListPlaceHolder_itemslv_ctrl')]/@href"
    ).extract())

    if flag:
        # .re() returns strings; convert so the comparison with ``curr``
        # below is numeric. Comparing int with str is always "less than"
        # on Python 2 and a TypeError on Python 3.
        total = int(hxs.select(
            "//span[@id='ctl00_ctl00_ContentPlaceHolder1_ItemListPlaceHolder_lbl_pagesizeBtm']/text()"
        ).re(r'\d+')[0])

    if curr < total:
        curr += 1
        # Hidden field whose current value is echoed back with the post-back.
        vs_file_name = hxs.select(
            ".//input[@id='ctl00_ctl00_ContentPlaceHolder1_ItemListPlaceHolder_hfVSFileName']/@value"
        ).extract()[0]
        yield FormRequest.from_response(
            response,
            formdata={
                'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$pagesddl': '50',
                'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$sortddl': 'Price(ASC)',
                'ctl00$ctl00$ScriptManager1': 'ctl00$ctl00$ScriptManager1|ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$pager1$ctl00$ctl01',
                # Pager control named as the post-back event target.
                '__EVENTTARGET': 'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$pager1$ctl00$ctl01',
                'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$hfVSFileName': vs_file_name,
            },
            # flag=False: the total is already known and travels in meta.
            meta={'curr': curr, 'total': total, 'flag': False},
            dont_click=True,
            callback=self.page_rs_50,
        )
    else:
        # Last listing page reached: request every collected product page.
        for pro in self.pro_urls:
            yield Request("http://www.e3050.com/%s" % pro,
                          callback=self.parse_product)
def parse_product(self, response):
    """Callback for a single product page; extraction is not implemented yet."""
    # TODO: implementation required for parsing.
    return None
I have not studied your code in depth, but I see something strange:
# Get last page number
# NOTE(review): the value is passed through int() in the loop condition
# below, so it is presumably the label text as a string — confirm.
last_page = hxs.select('//span[@id="ctl00_ctl00_ContentPlaceHolder1_ItemListPlaceHolder_lbl_PageSize"]/text()').extract()[0]
i = 1
# preparing requests for each page
# Runs while i is below (last_page / 5) + 1, with i starting at 1.
while i < (int(last_page) / 5) + 1:
# NOTE(review): the URL is a constant and `i` is not used in the request,
# so every pass queues an identical request.
requests.append(Request("http://www.e3050.com/Items.aspx?cat=SON", callback=self.parse_product))
i +=1
First, instead of initializing and incrementing `i` by hand, you can do:
for i in xrange(1, last_page // 5 + 1):
Then you do:
# The URL below is a fixed literal; nothing page-specific is sent with the request.
requests.append(Request("http://www.e3050.com/Items.aspx?cat=SON", callback=self.parse_product))
Are you creating many requests to the same URL?