import re
import json
import time

import jieba
import requests
import numpy as np
from urllib.parse import quote
from wordcloud import WordCloud
from bs4 import BeautifulSoup
from PIL import Image
import matplotlib.pyplot as plt
class DoubanCrawl:
    def __init__(self, info_type):
        self.info_type = info_type
        self.headers = [
            {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
                'Host': 'movie.douban.com'
            },
            {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
                'Host': 'book.douban.com'
            }
        ]
        self.movie_search_url = "https://movie.douban.com/j/subject_suggest?q="
        self.movie_url = "https://movie.douban.com/subject/%s/"
        self.movie_comment_url = "https://movie.douban.com/subject/%s/comments?start=%d&limit=20&sort=new_score&status=P"
        self.book_search_url = "https://book.douban.com/j/subject_suggest?q="
        self.book_url = "https://book.douban.com/subject/%s/"
        self.book_comment_url = "https://book.douban.com/subject/%s/comments/hot?p=%d"
    def info_crawl(self, name, bg_image=None):
        """
        Crawl short comments for a movie or book, save them to a text file,
        and render a word cloud.
        :param name: title to search for
        :param bg_image: optional background image for the word cloud
        :return:
        """
        name_str = self.__handle_name(name)
        text_list = []
        if self.info_type == "movie":
            print("----- crawling movie short comments -----")
            # Use local variables so the URL templates on self stay intact
            # across repeated calls.
            search_url = self.movie_search_url + name_str
            home_url, num_str = self.__find_url(search_url, 0)
            for i in range(0, 15):
                url = self.movie_comment_url % (num_str, i * 20)
                # Sleep 1-2 seconds between requests to avoid rate limiting.
                time.sleep(np.random.randint(1, 3))
                print("Fetching page %d" % i)
                r = requests.get(url, headers=self.headers[0])
                soup = BeautifulSoup(r.content, 'lxml')
                comment_list = soup.find_all('span', class_='short')
                for ct in comment_list:
                    text_list.append(ct.text)
            self.__comment_to_txt(name, text_list)
            self.__plot_wordcloud(name, bg_image)
        else:
            print("----- crawling book short comments -----")
            search_url = self.book_search_url + name_str
            home_url, num_str = self.__find_url(search_url, 1)
            for i in range(1, 20):
                url = self.book_comment_url % (num_str, i)
                time.sleep(np.random.randint(1, 3))
                print("Fetching page %d" % i)
                r = requests.get(url, headers=self.headers[1])
                soup = BeautifulSoup(r.content, 'lxml')
                comment_list = soup.find_all('span', class_='short')
                for ct in comment_list:
                    text_list.append(ct.text)
            self.__comment_to_txt(name, text_list)
            self.__plot_wordcloud(name, bg_image)
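    # Note: Douban typically serves only the first few hundred comments to
    # clients that are not logged in, so later pages may come back empty or
    # redirect. A minimal guard inside the loops above (an assumption, not
    # part of the original script) would check the response before parsing:
    #     if r.status_code != 200:
    #         break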
    def __plot_wordcloud(self, name, bg_image=None):
        """
        Render a word cloud from the saved comment file.
        :param name: title used as the file name stem
        :param bg_image: optional path to an image used as the cloud mask
        :return:
        """
        file_name = str(name) + '.txt'
        with open(file_name, 'r', encoding='utf-8') as f:
            text = f.read()
        # Segment the Chinese text with jieba so WordCloud can count words.
        cut_text = " ".join(jieba.cut(text))
        # If a background image is given, use it as the word-cloud mask.
        mask = np.array(Image.open(bg_image)) if bg_image else None
        print("Generating word cloud...")
        word_cloud = WordCloud(
            scale=10,
            font_path='C:/Windows/Fonts/simfang.ttf',  # a Chinese font is required
            background_color="white",
            mask=mask,
            width=1000,
            height=1000
        ).generate(cut_text)
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()
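    # A minimal sketch for persisting the cloud as well as displaying it:
    # WordCloud.to_file() writes the rendered image to disk. The file name
    # below is illustrative, not part of the original script.
    #     word_cloud.to_file(str(name) + '_wordcloud.png')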
    def __comment_to_txt(self, name, clist):
        """
        Append the crawled comments to <name>.txt, one per line.
        :param name: title used as the file name stem
        :param clist: list of comment strings
        :return:
        """
        file_name = str(name) + '.txt'
        with open(file_name, 'a+', encoding='utf-8') as f:
            for c in clist:
                # Separate comments with newlines so they do not run together.
                f.write(c + '\n')
    def __handle_name(self, name):
        """
        URL-encode the (typically Chinese) search keyword.
        :param name:
        :return:
        """
        return str(quote(name))
    def __find_url(self, url, tp):
        """
        Resolve the subject's real home-page URL and its numeric id.
        :param url: suggest URL with the encoded keyword appended
        :param tp: 0 for movies, 1 for books
        :return: (home page URL, subject id string)
        """
        r = requests.get(url, headers=self.headers[tp])
        json_data = json.loads(r.text)
        # The first suggestion's 'url' field contains the numeric subject id.
        address_num = re.search('[0-9]+', json_data[0]['url'])
        num_str = address_num.group(0)
        if tp == 0:
            return self.movie_url % num_str, num_str
        else:
            return self.book_url % num_str, num_str
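    # For reference, the suggest endpoint returns a JSON array of candidate
    # subjects; the code above relies only on the 'url' field of the first
    # entry. Any other fields sketched here are assumptions for illustration:
    #     [{"title": "...", "url": "https://movie.douban.com/subject/<id>/", ...}]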
if __name__ == '__main__':
    my_crawl = DoubanCrawl("movie")
    my_crawl.info_crawl('千与千寻')
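    # The same entry point works for book comments by switching info_type;
    # the title below is a hypothetical example, not from the original:
    #     book_crawl = DoubanCrawl("book")
    #     book_crawl.info_crawl('围城')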