
A Brief Summary of Python Web Scraping

Type 1: Content that can be scraped directly from the page

Crawlers of this type are simple. In general, fetch the page with requests and then pull out the target content with a regular expression.

import re
import requests

url = 'https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/001431608990315a01b575e2ab041168ff0df194698afac000'
res = requests.get(url)
# Match the src attribute of every <img> tag in the raw HTML
result = re.findall(r'img src="(\S+)"', res.text)
for i in result:
    if not i.startswith("http"):
        # Relative paths need the site's domain prepended
        print("https://www.liaoxuefeng.com" + i)
    else:
        print(i)
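
To go one step further and save the matched images to disk instead of just printing their URLs, here is a small self-contained sketch; the filename handling (last URL segment) is an illustrative assumption, not from the original post:

import re
import requests

page = requests.get('https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/001431608990315a01b575e2ab041168ff0df194698afac000').text
for link in re.findall(r'img src="(\S+)"', page):
    if not link.startswith("http"):
        link = "https://www.liaoxuefeng.com" + link
    # Derive a filename from the last URL segment (an illustrative assumption)
    filename = link.split('/')[-1] or 'image'
    with open(filename, 'wb') as f:
        f.write(requests.get(link).content)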

Type 2: Content visible on the page but not retrievable with a plain network request

This usually happens because the content is injected into the page dynamically by JavaScript. The fix is to analyze which additional requests the browser makes when it loads the page.

import requests

url = "https://unsplash.com/"
res = requests.get(url)
# The returned HTML contains none of the image data visible in the browser;
# that data is loaded afterwards by JavaScript
print(res.text)
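
Once the real data request is identified in the browser's developer tools (Network tab), it can usually be replayed directly with requests. A minimal sketch; the endpoint and parameters below are hypothetical stand-ins for whatever the Network tab actually reveals:

import requests

# Hypothetical JSON endpoint and parameters; substitute whatever the
# browser's Network tab shows for the target page
api_url = "https://unsplash.com/napi/photos"
params = {'page': 1, 'per_page': 12}
headers = {'User-Agent': 'Mozilla/5.0'}

res = requests.get(api_url, params=params, headers=headers)
for photo in res.json():
    print(photo['urls']['regular'])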

Another example, this time a site that requires logging in. The flow: GET the login page to receive the sessionid and csrftoken cookies, POST the credentials together with the CSRF token, then reuse those cookies for the authenticated data request.

url = "http://www.53iq.com/device/user/login"
res = requests.get(url)
s = res.headers['Set-Cookie']
s1 = re.findall(r'(sessionid=\S+)', s)[0]
s2 = re.findall(r'(csrftoken=\S+)', s)[0]
# print(s1, s2)

payload = {
'csrfmiddlewaretoken': s2,
'username': 'XXX',
'password': 'XXXXXX'
}
cookie = s1 + ' ' + s2 + ' Hm_lvt_854b60cb774bd0cf1ab148e604cfd819=1471424118,1471575858,1471676503,1471687362; Hm_lpvt_854b60cb774bd0cf1ab148e604cfd819=1471687362'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Length': '87',
'Content-Type': 'application/x-www-form-urlencoded',
'Cookie': cookie,
'DNT': '1',
'Host': 'www.53iq.com',
'Origin': 'http://www.53iq.com',
'Pragma': 'no-cache',
'Referer': 'http://www.53iq.com/device/user/login',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
res = requests.post(url, data=payload, headers=headers)

headers = {
'Accept':'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding':'gzip, deflate, sdch',
'Accept-Language':'zh-CN,zh;q=0.8',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':cookie,
'DNT':'1',
'Host':'www.53iq.com',
'Pragma':'no-cache',
'Referer':'http://www.53iq.com/device/device/new_device',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.366',
'X-Requested-With':'XMLHttpRequest'
}

url = "http://www.53iq.com/device/device/get_new_device"
params = {
'txtFaName': '海尔',
'page': '1',
'limit': '20'
}
res = requests.get(url, params=params, headers=headers)

print(res.json())

A simplified version: requests.Session keeps cookies across requests automatically, so the Set-Cookie parsing and hand-built headers above become unnecessary.

import requests

url = "http://www.53iq.com/device/user/login"
s = requests.session()
# The initial GET stores the csrftoken and sessionid cookies on the session
s.get(url)
s.post(url, data={
    "csrfmiddlewaretoken": s.cookies.get("csrftoken"),
    'username': 'XXX',
    'password': 'XXXXXX'
})

# The session re-sends the cookies, so no manual Cookie header is needed
url = "http://www.53iq.com/device/device/get_new_device"
params = {
    'txtFaName': '海尔',
    'page': '1',
    'limit': '20'
}
res = s.get(url, params=params)
print(res.json())

A scraping power tool: the HTML parsing library BeautifulSoup

BeautifulSoup parses HTML into a navigable tree, so elements can be located by tag, attribute, or document position instead of with regular expressions. The example below walks through a tutorial's table of contents page by page until it finds the page mentioning 'Sublime Text':

import requests
import time
from bs4 import BeautifulSoup

url = 'http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'

def download_page(url):
    data = requests.get(url).text
    return data

def parse_html(html, URL):
    """Return the URL of the next page, or None when the target is found."""
    print(URL + ' is under search...')
    iid = URL.split('/')[-1]
    if 'Sublime Text' in html:
        print("FIND!!!!!!" + URL)
        return None
    soup = BeautifulSoup(html, "html.parser")
    # The sidebar <li> for the current page carries the page id;
    # the <a> inside the following <li> points to the next page
    next_page = soup.find('li', attrs={'id': iid}).find_next('li').find('a')
    time.sleep(0.5)  # throttle so we don't hammer the site
    if next_page:
        print(next_page.get_text())
        return "http://www.liaoxuefeng.com" + next_page['href']
    return None


def main(url):
    html = download_page(url)
    Url = parse_html(html, url)
    while Url:
        html = download_page(Url)
        Url = parse_html(html, Url)

if __name__ == "__main__":
    main(url)
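
For reference, here are the BeautifulSoup calls used above in isolation, run against a small inline HTML string (a sketch, not from the original post):

from bs4 import BeautifulSoup

html = '<ul><li id="a"><a href="/x">X</a></li><li id="b"><a href="/y">Y</a></li></ul>'
soup = BeautifulSoup(html, "html.parser")

li = soup.find('li', attrs={'id': 'a'})  # first <li> whose id is "a"
nxt = li.find_next('li')                 # the next <li> in document order
a = nxt.find('a')
print(a.get_text())                      # Y
print(a['href'])                         # /y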

A scraping framework: Scrapy
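
As a taste of the framework, a minimal spider might look roughly like this (an illustrative sketch reusing the image-scraping task from Type 1, not code from the original post):

import scrapy

class ImageSpider(scrapy.Spider):
    # Minimal illustrative spider; run with: scrapy runspider thisfile.py
    name = "images"
    start_urls = ['https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000']

    def parse(self, response):
        # CSS selectors replace the manual regex matching used earlier
        for src in response.css('img::attr(src)').getall():
            yield {'src': response.urljoin(src)}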