# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup #导入bs4中的BeautifulSoup
import pymysql
# Open the MySQL connection used by the whole script.
# BUG FIX: in this first copy of the script the connect() call was never
# closed (the `cursorclass` argument and the closing parenthesis were lost),
# which is a SyntaxError. Restored to match the working duplicate below.
db = pymysql.connect(host='localhost',
                     user='root',
                     password='mysql123',
                     db='58tc',
                     charset='utf8mb4',
                     cursorclass=pymysql.cursors.DictCursor)
# A cursor for executing statements is obtained with db.cursor() further down.
def urls(url, timeout=None):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    url: the listing-page URL to fetch.
    timeout: optional number of seconds to wait for the HTTP request; the
        default of None keeps the original behaviour (wait indefinitely).
    Returns: a BeautifulSoup tree built with the 'html.parser' backend.
    """
    res = requests.get(url, timeout=timeout)
    # requests only guesses the charset from the headers; force UTF-8 so
    # .text decodes the Chinese page content correctly.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    return soup
url=['http://sz.58.com/nanshan/zufang/0/?minprice=0_1600&PGTID=0d300008-0071-3991-2740-1076d93269e7&ClickID=2']
for x in range(2,13): #因为58同城的页面网址是由JS计算而得的,总结规律发现是一下u中的'pn?'中‘?’为页码
u='http://sz.58.com/nanshan/zufang/0/pn'+str(x)+'/?minprice=0_1600'
url.append(u)
# BUG FIX: this first copy of the script used `cursor` and `sql` without ever
# defining them (they are only created in the duplicated code further down),
# so the execute() call raised NameError. Define both before the loop.
cursor = db.cursor()
sql = "INSERT INTO EX(`AREA`,`SPACE`,`TIME`,`PRINCE`,`HREF`) VALUES (%s,%s,%s,%s,%s)"

# Crawl every listing page; m is the page index, used only to tag the
# detail-page link below.
for y, m in zip(url, range(13)):
    soup = urls(y)
    # Each <li> is one rental listing; at most 35 are taken per page.
    for info, n in zip(soup.select('li'), range(35)):
        space = info.select('.des .room')[0].text
        c = info.select('.listliright .sendTime')[0].text.replace(" ", '')
        a = info.select('.listliright .money')[0].text
        # The scraped text is full of whitespace that strip() cannot remove,
        # so replace() plus regexes extract the interesting fragments.
        SPACE = re.findall('.室.厅.卫', space)  # room layout, e.g. "2室1厅1卫"
        AREA = re.findall('..㎡', space)  # floor area, e.g. "35㎡"
        TIME = re.findall('\r\n(....)', c)  # posting time fragment
        PRINCE = re.findall('\n(.......)', a)  # price; "PRINCE" typo kept — it matches the DB column
        HREF1 = info.select('.des a')[0]['href']
        HREF2 = HREF1 + '?from=' + str(m) + '-list-' + str(n)
        HREF = [HREF2]
        # TIME and SPACE are sometimes empty; store the literal string 'NULL'.
        if TIME == []:
            TIME = ['NULL']
        if SPACE == []:
            SPACE = ['NULL']
        cursor.execute(sql, (AREA, SPACE, TIME, PRINCE, HREF))
        # NOTE(review): commit-per-row assumed from the flattened paste;
        # confirm the intended indentation against the original script.
        db.commit()
db.close()
import requests
import re
from bs4 import BeautifulSoup #导入bs4中的BeautifulSoup
import pymysql
# Open the MySQL connection shared by the whole script.
db = pymysql.connect(host='localhost',
                     user='root',
                     password='mysql123',
                     db='58tc',
                     charset='utf8mb4',
                     cursorclass=pymysql.cursors.DictCursor)
# The target table was created once up front, so the DDL is kept here only
# for reference:
#   cursor.execute("DROP TABLE IF EXISTS employee")
#   sql1 = """CREATE TABLE employee(
#       AREA VARCHAR(20),
#       SPACE VARCHAR(20),
#       TIME VARCHAR(20),
#       PRINCE VARCHAR(20),
#       HREF VARCHAR(300))"""
#   cursor.execute(sql1)
# Obtain a cursor for executing statements.
cursor = db.cursor()
# Parameterised INSERT used for every scraped listing ("PRINCE" is a typo
# for "PRICE", but it is the actual column name in the table).
sql = "INSERT INTO EX(`AREA`,`SPACE`,`TIME`,`PRINCE`,`HREF`) VALUES (%s,%s,%s,%s,%s)"
def urls(url):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree."""
    response = requests.get(url)
    # Force UTF-8 so .text decodes the Chinese page content correctly.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')
url=['http://sz.58.com/nanshan/zufang/0/?minprice=0_1600&PGTID=0d300008-0071-3991-2740-1076d93269e7&ClickID=2']
for x in range(2,13): #因为58同城的页面网址是由JS计算而得的,总结规律发现是一下u中的'pn?'中‘?’为页码
u='http://sz.58.com/nanshan/zufang/0/pn'+str(x)+'/?minprice=0_1600'
url.append(u)
# Crawl every listing page; m is the page index, used only to tag the
# detail-page link below.
for y,m in zip(url,range(13)):
    soup=urls(y)
    # Each <li> is one rental listing; at most 35 are taken per page.
    for info,n in zip(soup.select('li'),range(35)):
        space=info.select('.des .room')[0].text
        c=info.select('.listliright .sendTime')[0].text.replace(" ",'')
        a=info.select('.listliright .money')[0].text
        # The scraped text is full of whitespace that strip() cannot remove,
        # so replace() plus regexes extract the interesting fragments.
        SPACE = re.findall('.室.厅.卫',space)  # room layout, e.g. "2室1厅1卫"
        AREA = re.findall('..㎡',space)  # floor area, e.g. "35㎡"
        TIME = re.findall('\r\n(....)',c)  # posting time fragment
        PRINCE = re.findall('\n(.......)',a)  # price; "PRINCE" typo kept — it matches the DB column
        HREF1 = info.select('.des a')[0]['href']
        HREF2 = HREF1+'?from='+str(m)+'-list-'+str(n)
        HREF = [HREF2]
        # print(AREA, SPACE, TIME, PRINCE, HREF)
        # TIME and SPACE are sometimes empty; store the literal string 'NULL'.
        if(TIME == []):
            TIME = ['NULL']
        if (SPACE == []):
            SPACE = ['NULL']
        cursor.execute(sql, (AREA, SPACE, TIME, PRINCE, HREF))
        # NOTE(review): commit-per-row assumed from the flattened paste;
        # confirm the intended indentation against the original script.
        db.commit()
db.close()
# Source: CSDN
# Author: 笨鸟后飞唯勤补拙
# Link: https://blog.csdn.net/goldenlake90/article/details/76168190