Goal: build an intelligent question-answering application for classical Chinese poetry, connected to WeChat, so that users can play the poetry chain game (接龙) by text or voice inside WeChat.
- Design the knowledge-graph schema (sketched below);
- Crawl classical poems and write them into Neo4j.
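The crawler below implies a small graph model: Person (poet), Poetry (poem), Types (poem category) and Time (dynasty) nodes, linked by write, belong_to and live_in relationships. Here is a minimal sketch of that schema, using the same py2neo calls as the crawler; the uniqueness constraints are an addition that the crawler itself does not create:

from py2neo import Graph

graph = Graph("http://localhost:11003/", username="admin", password="password")

# One uniqueness constraint per node label keeps MERGE fast and safe
# (Neo4j 3.x constraint syntax).
graph.run("CREATE CONSTRAINT ON (p:Person) ASSERT p.name IS UNIQUE")
graph.run("CREATE CONSTRAINT ON (t:Poetry) ASSERT t.aname IS UNIQUE")
graph.run("CREATE CONSTRAINT ON (ty:Types) ASSERT ty.name IS UNIQUE")
graph.run("CREATE CONSTRAINT ON (d:Time) ASSERT d.name IS UNIQUE")

# Relationships written by the crawler:
#   (:Person)-[:write]->(:Poetry)
#   (:Poetry)-[:belong_to]->(:Types)
#   (:Person)-[:live_in]->(:Time)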
The code is as follows:
from bs4 import BeautifulSoup
import random
import urllib.request
from urllib.request import urlopen
from py2neo import Graph
# Pool of User-Agent strings; one is picked at random per request to
# reduce the chance of being blocked
ua_list = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36",  # Chrome
    "Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0",  # Firefox
    "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",  # IE
    "Opera/9.99 (Windows NT 5.1; U; zh-CN) Presto/9.9.9",  # Opera
]
def not_empty(s):
    return s and s.strip()
# Collections that track which nodes already exist, so no node is created twice
list_sub = []
node = set()
main_node = set()
unique_title_type = set()
def get_everypoetry(sub_url, author, dynasty):
    """Crawl every poem on one listing page and write it into Neo4j."""
    ua = random.choice(ua_list)  # fresh User-Agent for this batch of requests
    req1 = urllib.request.Request(sub_url, headers={'User-agent': ua})
    html1 = urlopen(req1).read()
    soup1 = BeautifulSoup(html1, 'lxml')
    # Each listing entry links to a poem page; the poem's type is only
    # visible on that page, so every link has to be followed
    page = soup1.find_all('div', {'class': 'shici_list_main'})
    for item in page:
        # Link to the individual poem page
        text0 = item.find('a', {'target': "_blank"})['href']
        content_url = 'http://www.shicimingju.com' + text0
        req2 = urllib.request.Request(content_url, headers={'User-agent': ua})
        html2 = urlopen(req2).read()
        soup2 = BeautifulSoup(html2, 'lxml')
        # The poem page now yields title, content, appreciation and type
        # Poem title
        try:
            title = soup2.find('h1').text
        except Exception:
            print("failed to read the poem title")
            print(content_url)
            continue  # nothing below can be stored without a title
        # Create the poem node; the `node` set avoids repeated MERGE calls
        if title not in node:
            try:
                graph.run("MERGE (title:Poetry {aname:'" + title + "'})")
                node.add(title)
                # Link the poem to its author
                graph.run(
                    "MATCH (p:Person {name:'" + author + "'}),"
                    "(t:Poetry {aname:'" + title + "'}) "
                    "CREATE (p)-[:write]->(t)")
            except Exception:
                print("failed to write the title to Neo4j")
                print(content_url)
        # Poem content
        try:
            # Store the full text as-is; it can be tokenised at query time
            contents = soup2.find('div', {'class': 'item_content'}).text.strip()
        except Exception:
            print("failed to read the poem content")
            print(content_url)
            contents = ""  # keep the node even when the content is missing
        try:
            graph.run(
                "MATCH (p:Poetry {aname:'" + title + "'}) "
                "SET p.content = '" + contents + "'")
        except Exception:
            print("failed to write the poem content")
            print(content_url)
        # Appreciation / commentary
        try:
            appreciation = soup2.find('div', {'class': 'shangxi_content'}).text.strip()
        except Exception:
            print("failed to read the appreciation")
            print(content_url)
            appreciation = ""
        try:
            graph.run(
                "MATCH (t:Poetry {aname:'" + title + "'}) "
                "SET t.zapp = '" + appreciation + "'")
        except Exception:
            print("failed to write the appreciation")
            print(content_url)
        # Poem type: the block reads "类型 / <type1> / <type2> ..." once split
        try:
            poetry_type = soup2.find('div', {'class': 'shici-mark'}).text.strip().split('\n')
        except Exception:
            print("failed to read the poem type")
            poetry_type = ['类型', '其它']  # fall back to the catch-all type
            print(content_url)
        # Skip the "类型" header, strip whitespace, drop empty entries
        poetry_type_list = [t.strip() for t in poetry_type[1:] if t.strip()]
        for ty in poetry_type_list:
            # Create the type node once
            if ty not in node:
                graph.run("MERGE (types:Types {name:'" + ty + "'})")
                node.add(ty)
            # Link the poem to its type, once per (title, type) pair
            try:
                if title + "," + ty not in unique_title_type:
                    graph.run(
                        "MATCH (t:Poetry {aname:'" + title + "'}),"
                        "(p:Types {name:'" + ty + "'}) "
                        "CREATE (t)-[:belong_to]->(p)")
                    unique_title_type.add(title + "," + ty)
            except Exception:
                print("failed to create the poem-type relationship")
if __name__ == "__main__":
#连接图数据库
graph = Graph (
"http://localhost:11003/",
username="admin",
password="password"
)
# 创建list存储节点信息,节点不允许重复
# list_main = []
flag=True
url='http://www.shicimingju.com/chaxun/zuozhe/'
for i in range(1,652):
ua = random.choice ( ua_list )
main_url = url+str(i)+'.html'
# html, status = gethtml.get_html ( url )
req = urllib.request.Request ( main_url, headers={'User-agent' : ua} )
html = urlopen ( req ).read ( )
soup = BeautifulSoup ( html, 'lxml' )
        try:
            # The author page yields poet, dynasty, brief bio and poem count
            author = soup.find('div', {'class': 'card about_zuozhe'}).find('h4').text  # node
            brief = soup.find('div', {'class': 'des'}).text  # property
            dynasty = soup.find('div', {'class': 'aside_val'}).text  # node
            total_poetry = soup.find('div', {'class': 'aside_right'}).find(
                'div', {'class': 'aside_val'}).text  # property
            new_author = author not in main_node
            if new_author:
                graph.run(
                    "MERGE (author:Person {name:'" + author + "', brief:'" + brief
                    + "', total_poetry:'" + total_poetry + "'})")
                main_node.add(author)
            if dynasty not in main_node:
                graph.run("MERGE (dynasty:Time {name:'" + dynasty + "'})")
                main_node.add(dynasty)
            # Link each newly seen poet to a dynasty exactly once
            if new_author:
                graph.run(
                    "MATCH (p:Person {name:'" + author + "'}),"
                    "(t:Time {name:'" + dynasty + "'}) "
                    "CREATE (p)-[:live_in]->(t)")
        except Exception:
            print("failed to read poet, dynasty, brief or poem count")
            print(main_url)
            continue  # author/dynasty are undefined here, skip this poet
        # Crawl the poems on the first listing page
        get_everypoetry(main_url, author, dynasty)
        # Work out how many listing pages this poet has
        try:
            number = soup.find('div', {'id': 'list_nav_all'}).find_all('a')
            page_number = len(number)
        except Exception:
            print("failed to read the page count")
            page_number = 0  # no pagination links: a single page
        # Remaining pages are named <i>_2.html, <i>_3.html, ...
        for j in range(2, page_number):
            sub_url = url + str(i) + '_' + str(j) + '.html'
            get_everypoetry(sub_url, author, dynasty)
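One caveat about the code above: the Cypher statements are assembled by string concatenation, so a quote character inside a title, bio or poem body breaks the query, which is one reason several of the except branches fire. py2neo's graph.run also accepts parameters, which sidesteps the escaping problem entirely; here is a sketch of the same writes in parameterised form (title/author/contents as in the crawler above):

# Parameterised Cypher: values travel separately from the query text,
# so quotes inside titles or poem bodies cannot break the statement.
graph.run("MERGE (t:Poetry {aname: $title})", title=title)
graph.run(
    "MATCH (p:Person {name: $author}), (t:Poetry {aname: $title}) "
    "MERGE (p)-[:write]->(t)",
    author=author, title=title)
graph.run(
    "MATCH (t:Poetry {aname: $title}) SET t.content = $content",
    title=title, content=contents)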
The resulting knowledge graph looks as follows: [screenshot]
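As a quick sanity check of the import, a query along these lines fetches one poet's poems together with their types (the poet name is purely illustrative):

# List up to ten poems by one poet together with their categories.
for record in graph.run(
        "MATCH (p:Person {name: $name})-[:write]->(t:Poetry)"
        "-[:belong_to]->(ty:Types) "
        "RETURN t.aname AS poem, ty.name AS type LIMIT 10",
        name="李白"):
    print(record["poem"], record["type"])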
Source: CSDN
Author: haiziccc
Link: https://blog.csdn.net/haiziccc/article/details/103970980