import re
import urllib.error
import urllib.request

import xlwt
from bs4 import BeautifulSoup


# Fetch all pages of the list and parse out one record per movie
def getData(baseurl):
    datalist = []
    for i in range(0, 10):                      # 10 pages, 25 movies each
        url = baseurl + str(i * 25)
        html = askURL(url)
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            rank = re.findall(r'<em class="">(.*)</em>', item)[0]
            title = re.findall(r'<span class="title">(.*)</span>', item)[0]
            link = re.findall(r'<a href="(.*)">', item)[0]
            rating_num = re.findall(r'<span class="rating_num" property="v:average">(.*)</span>', item)[0]
            people = re.findall(r'<span>(.*人评价)</span>', item)[0]
            data.append(rank)
            data.append(title)
            data.append(rating_num)
            data.append(link)
            data.append(people)
            datalist.append(data)
    print(datalist)
    return datalist
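
# A minimal alternative sketch: the regexes above run on the stringified tag
# and are tightly coupled to the exact HTML text. BeautifulSoup can pull the
# same fields by tag navigation. parse_item is a hypothetical helper, and the
# selectors assume the same Douban Top250 markup the regexes rely on.
def parse_item(item):
    rank = item.find('em').get_text()
    title = item.find('span', class_='title').get_text()
    link = item.find('a')['href']
    rating_num = item.find('span', class_='rating_num').get_text()
    # The "N人评价" span carries no class; it is the last <span> in the star div
    people = item.find('div', class_='star').find_all('span')[-1].get_text()
    return [rank, title, rating_num, link, people]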
# Request one page and return its HTML, spoofing a browser User-Agent
# so the site does not reject the crawler outright
def askURL(url):
    head = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # On failure, report the HTTP status or network reason and
        # fall through to return an empty string
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html
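
# A hardening sketch, not part of the original script: urlopen with no timeout
# can hang indefinitely on a stalled connection. urlopen's documented timeout
# argument (in seconds) plus catching socket.timeout keeps the crawl moving.
# askURL_with_timeout and the 10-second default are assumptions.
import socket

def askURL_with_timeout(url, timeout=10):
    head = {"user-agent": "Mozilla/5.0"}
    request = urllib.request.Request(url, headers=head)
    try:
        response = urllib.request.urlopen(request, timeout=timeout)
        return response.read().decode('utf-8')
    except socket.timeout:
        print('timed out:', url)
    except urllib.error.URLError as e:
        print(getattr(e, 'code', ''), getattr(e, 'reason', ''))
    return ''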
# Save the scraped records to an Excel workbook
def saveData(datalist, savepath):
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    worksheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
    # Five columns, matching the order the fields were appended in getData
    col = ("电影排名", "电影名称", "电影评分", "电影链接", "评价人数")
    for i in range(0, 5):
        worksheet.write(0, i, col[i])           # header row
    for i in range(len(datalist)):              # 250 movies in total
        for j in range(0, 5):
            worksheet.write(i + 1, j, datalist[i][j])
    workbook.save(savepath)
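
# A minimal driver sketch showing how the three functions fit together.
# The Douban Top250 start URL is an assumption about the page the regexes
# target; the output filename comes from the original script, and the
# __main__ guard is an addition.
if __name__ == '__main__':
    baseurl = 'https://movie.douban.com/top250?start='
    datalist = getData(baseurl)
    saveData(datalist, '豆瓣Top250.xls')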