Learning Python Scraping: Baidu Related Search Keywords

This came from a request by the operations team: for a given search term, they wanted to know which related search keywords Baidu suggests, how often each one shows up, and to have the results exported to a CSV file.
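At its core the script simply requests the Baidu results page for a keyword and reads the link text out of the related-searches block. Here is that step in isolation as a minimal sketch, using the same cloudscraper session and the same "#rs" selector as the full script; the id="rs" block is an assumption about Baidu's current markup, so the selector may need adjusting if the page changes, and the keyword "python" is just an illustration.

import cloudscraper
from urllib import parse
from bs4 import BeautifulSoup

scraper = cloudscraper.create_scraper(browser='chrome')
# URL-encode the keyword and fetch the normal web results page
url = "https://www.baidu.com/s?ie=utf-8&wd=%s" % parse.quote_plus("python")
html = scraper.get(url).text
# The related searches sit inside the element with id="rs" near the bottom of the page
related = [a.get_text().strip() for a in BeautifulSoup(html, "lxml").select("#rs a")]
print(related)

The full script adds the counting, the recursion over newly found terms, and the CSV export: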

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: xo9
# Date: 2020.01.19

import cloudscraper
import random
import time
import csv
from urllib import parse
from bs4 import BeautifulSoup as bs

# Keyed by search term; stores how many times each term has been seen
words = {}
# Keyed by search term; flags whether a term has already been searched
results = {}
scraper = cloudscraper.create_scraper(browser='chrome')

def baidu_search(keyword):
    print("Looking up related searches for \033[1;32m%s\033[0m" % keyword)
    # URL-encode the search term
    url = "https://www.baidu.com/s?ie=utf-8&wd=%s" % parse.quote_plus(keyword)
    headers = {}
    headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"
    headers['Referer'] = url
    req = scraper.get(url, headers=headers)
    soup = bs(req.text, "lxml")
    # Related searches sit in the block with id="rs" on the results page
    element = soup.select("#rs a")
    for i in element:
        item = i.get_text().strip()
        print(item)
        if item not in words:
            # First time this term appears: record it and mark it as not yet searched
            words[item] = 1
            results[item] = 0
        else:
            # Otherwise just bump its count
            words[item] = words.get(item) + 1
    print()

if __name__ == '__main__':
    keyword = ''
    level = ''
    while not keyword:
        keyword = input('Search term: ').strip()
    while not level.isdigit():
        level = input('Recursion depth: ').strip()
    baidu_search(keyword)
    i = 1
    while i < int(level):
        tmp_results = list(results)
        for record in tmp_results:
            if results[record] == 0:
                baidu_search(record)
                results[record] = 1
                # Pause between requests so we don't hammer Baidu
                time.sleep(random.randint(2, 5))
        i = i + 1

    # Export the counts stored in the words dict to a CSV file
    with open(keyword + '.csv', 'w', newline='') as f:
        w = csv.writer(f)
        w.writerows(words.items())
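One optional tweak, not part of the original script: the export above writes the CSV with the platform's default encoding, so Excel on Windows often shows the Chinese keywords as mojibake. Writing the file with a UTF-8 BOM avoids that, and sorting by count makes the result easier to scan. A small variant of the export step, assuming the same words dict, keyword variable, and csv import as above:

# Variant of the export step (an extra, not in the original): utf-8-sig adds a BOM
# so Excel detects the encoding, and rows are sorted by count, highest first.
with open(keyword + '.csv', 'w', newline='', encoding='utf-8-sig') as f:
    w = csv.writer(f)
    w.writerows(sorted(words.items(), key=lambda kv: kv[1], reverse=True))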

When run, the script prints each related keyword as it finds it and writes the final counts to <keyword>.csv.
