import re
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
def askurl(url):
    """Fetch *url* with a browser-like User-Agent and return the body
    decoded as UTF-8.

    Returns an empty string when the request fails; the HTTP status code
    and/or failure reason are printed instead of raising.
    """
    # Browser-style headers; adapted from
    # https://blog.csdn.net/bookssea/article/details/107309591
    head = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries .code; plain URLError carries only .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def get_page():
    """Return the ranking-page URLs for pages 1..276 of bgm.tv's anime browser.

    NOTE: the original comment claimed "first 10 pages", but the range
    actually covers 276 pages; the comment was wrong, not the code.
    """
    urls = [
        'https://bgm.tv/anime/browser?sort=rank&page={}'.format(str(i))
        for i in range(1, 277)
    ]
    return urls
def clearblankline():
    """Copy every non-blank line from ``1.txt`` into ``bgm排行榜.txt``,
    inserting one blank line after every 4 written lines (each scraped
    entry spans 4 text lines, so this groups entries visually).

    The original used an undeclared ``global i`` as the counter, which
    raised NameError unless the caller happened to define it; a local
    counter is used instead — the call signature is unchanged.
    """
    count = 0
    with open('1.txt', 'r', encoding='utf-8') as fr, \
         open('bgm排行榜.txt', 'w', encoding='utf-8') as fd:
        for text in fr:
            # text.split() is falsy for whitespace-only lines — skip them.
            if text.split():
                fd.write(text)
                count += 1
                if count % 4 == 0:
                    fd.write('\n')
    print('输出成功....')
def show():
    """Print the running counter (module-level global ``a``) and advance it.

    Used by the main loop as a crude progress indicator; ``a`` is
    initialized in the ``__main__`` block.
    """
    global a
    print(a)
    a += 1
if __name__ == "__main__":
    a = 1  # progress counter consumed by show()
    i = 0  # kept for compatibility with the original script's globals
    for url in get_page():
        # Fetch the page source for each ranking page.
        html = askurl(url)
        bs = BeautifulSoup(html, 'html.parser')
        # Ranked entries alternate classes "item odd clearit" / "item even clearit".
        # Use a context manager instead of the original open/close-per-write.
        with open("1.txt", 'a', encoding='utf-8') as txtfile:
            for name in bs.find_all('li', class_=re.compile('(item )(odd|even)( clearit)')):
                txtfile.write(name.get_text())
        # NOTE(review): the mangled source makes show()'s nesting ambiguous;
        # placed at page level so it prints one progress number per page.
        show()
    print("爬取完成")
    clearblankline()