# 需求: 编写爬虫项目并与Django项目相结合, 将爬取到的数据展示到前端页面上
# spider编写:
import scrapy
from dl.items import DlItem
class PSpider(scrapy.Spider):
    """Scrape the proxy table at kuaidaili.com/free/ into ``DlItem`` objects.

    Every table row that contains ``<td>`` cells yields one item whose fields
    are filled from the row's first five cells, in column order.
    """

    name = 'p'
    start_urls = ['https://www.kuaidaili.com/free/']

    # Item field filled from each <td>, in column order; a field's 1-based
    # position in this tuple is the XPath index of its cell.
    _COLUMNS = ('ip', 'port', 'typ', 'protocal', 'position')

    def parse(self, response):
        """Yield one ``DlItem`` per data row of the table under ``#list``.

        A cell without a text node leaves its field set to ``None``, because
        ``extract_first()`` returns ``None`` when nothing matches.
        """
        # <tbody> is deliberately not named in the path: browsers add that
        # element when rendering a table, so a path copied from dev tools can
        # match nothing in the raw HTML that Scrapy downloads.  ``//tr[td]``
        # works with or without it and skips header rows made only of <th>.
        rows = response.xpath('//*[@id="list"]/table//tr[td]')
        for row in rows:
            item = DlItem()
            for index, field in enumerate(self._COLUMNS, 1):
                cell_text = row.xpath('./td[%d]/text()' % index).extract_first()
                item[field] = cell_text
            yield item
# items编码
import scrapy
class DlItem(scrapy.Item):
    """Container for one scraped proxy row.

    Declares the five fields that the spider assigns for every table row.
    """

    # The spider fills these from table columns 1-5, in this order.
    ip = scrapy.Field()
    port = scrapy.Field()
    typ = scrapy.Field()
    protocal = scrapy.Field()
    position = scrapy.Field()
# Django项目创建与所有配置:
1.models创建:
from django.db import models
# Create your models here.
class Proxy(models.Model):
    """One proxy entry; every column is stored as text of up to 50 characters."""

    # Field names mirror the scraped item's fields one-to-one.
    ip = models.CharField(max_length=50)
    port = models.CharField(max_length=50)
    typ = models.CharField(max_length=50)
    protocal = models.CharField(max_length=50)
    position = models.CharField(max_length=50)

    def __str__(self):
        """Return ``ip:port`` so object listings show a readable label."""
        return '%s:%s' % (self.ip, self.port)
2.在scrapy框架项目中嵌入django
import os
import sys

# Append the parent of the current working directory to the import path,
# presumably so that the ``proxyscan`` package named below can be found.
# NOTE(review): the result depends on the directory the process is started
# from - confirm this matches the project layout.
project_parent = os.path.dirname(os.path.abspath('.'))
sys.path.append(project_parent)

# Tell Django which settings module to load.
os.environ['DJANGO_SETTINGS_MODULE'] = 'proxyscan.settings'

# Initialise Django by hand; imported only after the path and the settings
# variable above are in place, as in the original ordering.
import django

django.setup()
3.修改爬虫item:
import scrapy
from scrapy_djangoitem import DjangoItem
from proxy import models
class DlItem(DjangoItem):
    """Scrapy item bound to the ``Proxy`` Django model via ``django_model``."""

    django_model = models.Proxy
4.pipeline编码:
class DlPipeline(object):
    """Item pipeline that stores every item by calling its ``save()`` method."""

    def process_item(self, item, spider):
        """Save ``item`` and hand it on unchanged.

        Args:
            item: Object exposing a ``save()`` method.
                NOTE(review): with a DjangoItem this is expected to write the
                row through the Django ORM - confirm against scrapy-djangoitem.
            spider: The spider that produced the item; not used here.

        Returns:
            The same ``item`` object that was passed in.
        """
        # The two prints are progress messages only: this method does not
        # open or close any connection itself.
        print('开启数据库, 进行数据存储')
        item.save()
        print('关闭数据库')
        return item
5.Django项目迁移数据库与admin后台配置
python manage.py makemigrations
python manage.py migrate
# admin.py: make the Proxy model manageable from the Django admin site.
# ``admin`` must be imported here; the registration call below uses it.
from django.contrib import admin

from proxy.models import Proxy

admin.site.register(Proxy)
# 创建超级用户:
python manage.py createsuperuser
# 路由:
from django.conf.urls import url
from django.contrib import admin
from proxy.views import index
# Route table for the project.  Neither pattern ends with ``$``, so each one
# matches any path that starts with the given prefix.
urlpatterns = [
    # Django admin site.
    url(r'^admin/', admin.site.urls),
    # Proxy list page rendered by ``proxy.views.index``.
    url(r'^index/', index),
]
# 视图函数:
from django.shortcuts import render
from proxy.models import Proxy
def index(requests):
    """Render ``index.html`` with every ``Proxy`` row.

    Args:
        requests: The incoming request object, forwarded to ``render``
            (a single request, despite the plural name).

    Returns:
        The response produced by ``render``; the template receives the
        result of ``Proxy.objects.all()`` under the context key ``p``.
    """
    proxies = Proxy.objects.all()
    return render(requests, 'index.html', {"p": proxies})
# 前端代码:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <script src="https://cdn.bootcss.com/jquery/3.4.1/jquery.min.js"></script>
    <link href="https://cdn.bootcss.com/twitter-bootstrap/4.3.1/css/bootstrap.min.css" rel="stylesheet">
</head>
<body>
{# The stylesheet above is Bootstrap 4, so the layout uses its card-* and mx-auto classes rather than Bootstrap 3 panel-* and col-md-offset-*. #}
<div class="container">
    <div class="row">
        <div class="col-md-10 mx-auto">
            <div class="card border-primary" style="margin-top:50px">
                <div class="card-header bg-primary text-white">
                    <h3 class="card-title mb-0">代理IP一览表</h3>
                </div>
                <div class="card-body">
                    <table class="table table-striped">
                        <thead>
                        <tr>
                            <th scope="col">IP</th>
                            <th scope="col">Port</th>
                            <th scope="col">Type</th>
                            <th scope="col">Protocol</th>
                            <th scope="col">Position</th>
                        </tr>
                        </thead>
                        <tbody>
                        {# "p" is the context variable supplied by the index view. #}
                        {% for i in p %}
                            <tr>
                                <th scope="row">{{ i.ip }}</th>
                                <td>{{ i.port }}</td>
                                <td>{{ i.typ }}</td>
                                <td>{{ i.protocal }}</td>
                                <td>{{ i.position }}</td>
                            </tr>
                        {% empty %}
                            {# Shown when the view passes no rows. #}
                            <tr>
                                <td colspan="5" class="text-center">暂无数据</td>
                            </tr>
                        {% endfor %}
                        </tbody>
                    </table>
                </div>
            </div>
        </div>
    </div>
</div>
</body>
</html>
来源:oschina
链接:https://my.oschina.net/u/4355951/blog/4092506