Python Web Scraping

1. Scraping the Douban Top 250 Movies

The script below fetches all ten pages of the Top 250 list, extracts each movie's rank, title, rating, link, and rating count with regular expressions, and writes the results to an Excel file.

#------------------------------------
# !/usr/bin/env python
# -*- encoding: utf-8 -*-
# @FileName : 豆瓣Top250.py
# @Author : 嘿嘿嘿
# @Time : 2020/7/19 0:11
# @Software : PyCharm
#------------------------------------

from bs4 import BeautifulSoup        # HTML parsing
import re                            # regular expressions for text matching
import urllib.request, urllib.error  # fetching pages by URL
import xlwt                          # writing Excel files


def main():
    baseurl = "https://movie.douban.com/top250?start="
    datalist = getData(baseurl)
    savepath = ".\\豆瓣电影Top250.xls"
    saveData(datalist, savepath)


# Fetch and parse all ten result pages
def getData(baseurl):
    datalist = []
    for i in range(0, 10):                   # 10 pages, 25 movies per page
        url = baseurl + str(i * 25)
        html = askURL(url)
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_="item"):
            item = str(item)
            rank = re.findall(r'<em class="">(.*)</em>', item)[0]
            title = re.findall(r'<span class="title">(.*)</span>', item)[0]
            link = re.findall(r'<a href="(.*)">', item)[0]
            rating_num = re.findall(r'<span class="rating_num" property="v:average">(.*)</span>', item)[0]
            people = re.findall(r'<span>(.*人评价)</span>', item)[0]
            datalist.append([rank, title, rating_num, link, people])
    print(datalist)
    return datalist


# Request a page with a browser User-Agent so the site does not reject us
def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html


# Save the scraped rows to an Excel file
def saveData(datalist, savepath):
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    worksheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
    col = ("Rank", "Title", "Rating", "Link", "Number of ratings")
    for i in range(0, 5):
        worksheet.write(0, i, col[i])        # header row
    for i in range(0, 250):
        for j in range(0, 5):
            worksheet.write(i + 1, j, datalist[i][j])
    workbook.save(savepath)


if __name__ == "__main__":
    main()
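The regex matching above operates on stringified HTML, which breaks as soon as Douban tweaks its markup. A minimal alternative sketch, assuming the current div.item structure still holds: the same fields can be read straight from the parse tree. parse_items is a hypothetical helper, not part of the original script.

# Sketch: read fields from the parse tree instead of regex-matching
# stringified HTML. Assumes Douban still wraps each movie in
# <div class="item">; parse_items is a hypothetical helper.
from bs4 import BeautifulSoup

def parse_items(html):
    rows = []
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', class_='item'):
        rank = item.find('em').get_text()                          # ranking number
        title = item.find('span', class_='title').get_text()       # first (Chinese) title
        link = item.find('a')['href']                              # detail-page URL
        rating = item.find('span', class_='rating_num').get_text() # average score
        rows.append([rank, title, rating, link])
    return rows

Each row produced this way can be appended to datalist exactly as in getData above.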

2. Scraping the Jiangxi Gaokao Admission Cutoffs

This script pulls the 2019 Jiangxi second-tier (二本) admission-line table from dxsbb.com, cleans the table cells, regroups them into six-column rows, and writes them to Excel.

#------------------------------------
# !/usr/bin/env python
# -*- encoding: utf-8 -*-
# @FileName : 江西高考投档线.py
# @Author : 嘿嘿嘿
# @Time : 2020/7/19 14:49
# @Software : PyCharm
#------------------------------------

from bs4 import BeautifulSoup        # HTML parsing
import re                            # regular expressions for text matching
import urllib.request, urllib.error  # fetching pages by URL
import xlwt                          # writing Excel files


def main():
    baseurl = "https://www.dxsbb.com/news/58588.html"
    datalist = getData(baseurl)
    saveData(datalist)


# Fetch the page and regroup its table cells into six-column rows
def getData(baseurl):
    title_col = ['No.', 'Subject Category', 'School Code', 'School Name',
                 'Admission Cutoff', 'Lowest Admitted Rank']
    cells = []
    html = askURL(baseurl)
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('td', style="text-align:center;"):
        cell = str(item).strip()
        # drop the <a class="keyWord" ...> links the site wraps around school names
        cell = re.sub(r'<a[^>]*>', '', cell)
        cell = re.sub(r'</a>', '', cell)
        text = re.findall(r'<td style="text-align:center;">(.*)</td>', cell)[0]
        print(text)
        cells.append(text)
    # regroup the flat cell list into rows of six columns
    datalist = [cells[i:i + 6] for i in range(0, len(cells), 6)]
    if datalist and len(datalist[-1]) < 6:
        datalist.pop()             # drop a trailing incomplete row
    datalist[0] = title_col        # replace the page's header row with a clean one
    return datalist


# Request a page with a browser User-Agent so the site does not reject us
def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('gbk')   # the site serves GBK-encoded pages
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html


# Save the rows to an Excel file
def saveData(datalist):
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    worksheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
    for i in range(0, len(datalist)):
        for j in range(0, 6):
            worksheet.write(i, j, datalist[i][j])
    workbook.save('2019江西高考二本投档线.xls')


if __name__ == "__main__":
    main()
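For a lighter-weight variant, get_text() flattens the nested keyword links without any regular expressions, and the standard csv module removes the xlwt dependency. A minimal sketch, assuming the same page layout as above; save_as_csv and the output filename are illustrative, not part of the original script.

# Sketch: the same extraction without regex or xlwt. Assumes the page
# still centers its table cells; save_as_csv is a hypothetical helper.
import csv
from bs4 import BeautifulSoup

def save_as_csv(html, path='jiangxi_2019.csv'):
    soup = BeautifulSoup(html, 'html.parser')
    # get_text() drops nested <a> tags, so no tag-stripping regex is needed
    cells = [td.get_text().strip()
             for td in soup.find_all('td', style="text-align:center;")]
    rows = [cells[i:i + 6] for i in range(0, len(cells), 6)]   # six columns per row
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        csv.writer(f).writerows(rows)

The utf-8-sig encoding keeps Excel from mangling the Chinese text when it opens the CSV.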