在爬取页面中a标签的href属性时,遇到这样一个问题:如果某个a标签没有href属性,用 tag['href'] 取值就会抛出 KeyError 异常。
百度了有师傅用正则匹配的,方法感觉都不怎么好,查了BeautifulSoup的官方文档,发现一个不错的方法,如下图:
官方文档链接:https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
has_attr() 这个方法可以判断某标签是否存在某属性,如果存在则返回 True,否则返回 False
解决办法:
为美观使用了匿名函数
# Select every tag that carries an href attribute; the predicate does not
# restrict the tag name, so the match is not limited to <a> tags.
soup_a = soup.find_all(lambda tag:tag.has_attr('href'))
最终代码:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Riy

import logging
import sys
import time
from multiprocessing import Process, Pool

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


logging.basicConfig(
    level=logging.DEBUG,
    format='%(levelname)-10s: %(message)s',
)


class down_url:
    """Collect href attribute values from web pages and save them to a file."""

    def download(self, url, timeout=10):
        """Fetch *url* and return the href values found in the page.

        Every tag that has an ``href`` attribute is inspected, and only
        values containing a dot are kept.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A list of href strings. The list is empty when the request fails.
        """
        # Bound before the try block so the return below is always valid,
        # even when the request raises.
        soup_a_href_list = []
        try:
            start = time.time()
            logging.debug('starting download url...')
            response = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(response.content, 'lxml')
            for tag in soup.find_all(lambda tag: tag.has_attr('href')):
                href = tag['href']
                # str.find() returns -1 (truthy) when the dot is missing and
                # 0 (falsy) when it comes first, so test membership instead.
                if '.' in href:
                    soup_a_href_list.append(href)
            print(f'运行了{time.time()-start}秒')
        except RequestException as e:
            logging.error('download failed for %s: %s', url, e)
        return soup_a_href_list

    @staticmethod
    def write(soup_a_href_list, txt):
        """Append each href in *soup_a_href_list* to the file *txt*, one per line.

        Declared static because it takes no ``self``; this keeps the
        two-argument signature while allowing ``instance.write(hrefs, txt)``.
        """
        logging.debug('starting write txt...')
        with open(txt, 'a', encoding='utf-8') as f:
            f.writelines(f'{href}\n' for href in soup_a_href_list)
        print(f'已生成文件{txt}')

    def help_memo(self):
        """Print the command-line usage summary."""
        print('''
        -h or --help 查看帮助
        -u or --url 添加url
        -t or --txt 写入txt文件
        ''')

    def welcome(self):
        """Print the welcome banner."""
        desc = ('欢迎使用url爬取脚本'.center(30, '*'))
        print(desc)


def main():
    """Parse ``sys.argv`` and run the requested action.

    Supported invocations:
        (no arguments)          print the banner and the help text
        -h | --help             print the help text
        -u URL -t FILE          crawl URL and append its hrefs to FILE
        -u URL [URL ...]        crawl the URLs in parallel and print the results
    """
    args = sys.argv[1:]
    temp = down_url()
    logging.debug('starting run python...')
    try:
        if not args:
            temp.welcome()
            temp.help_memo()
        elif args[0] in {'-h', '--help'}:
            temp.help_memo()
        elif args[0] in {'-t', '--txt'}:
            print('请先输入url!')
        elif (args[0] in {'-u', '--url'} and len(args) >= 4
                and args[2] in {'-t', '--txt'}):
            # The length check comes first so that "--url URL" on its own
            # falls through to the multi-URL branch instead of raising.
            hrefs = temp.download(args[1])
            temp.write(hrefs, args[3])
        elif args[0] in {'-u', '--url'} and len(args) >= 2:
            url_list = args[1:]
            print(url_list)
            # The pool is created only when needed. Results are collected
            # inside the with-block because leaving it terminates the workers.
            with Pool(3) as pool:
                pending = [
                    pool.apply_async(temp.download, args=(url,))
                    for url in url_list
                ]
                for result in pending:
                    print(result.get())
        else:
            temp.help_memo()
            print('输入的参数有误!')
    except Exception as e:
        # Top-level boundary: report the problem and show the usage text.
        print(e)
        temp.help_memo()


if __name__ == '__main__':
    main()
来源:https://www.cnblogs.com/riyir/p/12460042.html