regex (regular expressions)

# Scrape the Douban Top 250 movie list: name, rating, and one-line tagline (quote).
# The key tricks are .*? (lazy match) to skip over unwanted HTML and
# (?P<group_name>.*?) to pull out the wanted pieces as named groups.
# A standalone sketch of these two tricks follows this script.

import requests
import re

import WebConstant


pattern = r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?<span class="rating_num" property="v:average">(?P<rate>.*?)</span>.*?<span class="inq">(?P<slogan>.*?)</span>.*?</li>'
regex = re.compile(pattern, re.S)

movie_list = []


for i in range(10):
    url = "https://movie.douban.com/top250?start=" + str(i * 25)
    response = requests.get(url, headers={'User-Agent': WebConstant.USER_AGNET})
    response.encoding = "utf-8"
    text = response.text
    groups = regex.finditer(text)
    for match in groups:  # use a distinct name so the page counter i is not shadowed
        movie_info = {}
        movie_info["name"] = match.group("name")
        movie_info["rate"] = match.group("rate")
        movie_info["slogan"] = match.group("slogan")
        movie_list.append(movie_info)

print(movie_list)
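To see the two pattern tricks in isolation, here is a minimal sketch run on a made-up fragment of one list entry (no network access needed):

import re

# A simplified, made-up fragment of a single <li> entry.
sample = '<span class="title">肖申克的救赎</span><span class="rating_num" property="v:average">9.7</span>'

# .*? lazily skips anything between the parts we care about;
# (?P<name>...) captures a region under a readable group name.
m = re.search(r'<span class="title">(?P<name>.*?)</span>.*?'
              r'<span class="rating_num" property="v:average">(?P<rate>.*?)</span>',
              sample, re.S)
print(m.group("name"), m.group("rate"))  # 肖申克的救赎 9.7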

Locating elements with XPath

# Notes on using XPath.
# Extract shop name, price, and location information from zbj.com (ZhuBaJie) via XPath.

import requests
from lxml import etree

import WebConstant

url = "https://chengdu.zbj.com/search/f/?kw=logo"
headers = {
    'User-Agent': WebConstant.USER_AGNET
}


response = requests.get(url, headers=headers)
response.encoding = 'utf-8'

html = etree.HTML(response.text)
# etree.parse() would be the counterpart for a local file (see the sketch after this script)

divs = html.xpath('//*[@id="utopia_widget_76"]/a[1]/div')

for div in divs:
    print(div)
    price = div.xpath('./p/text()')
    print(price)
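Aside: etree.HTML() parses an HTML string already in memory, while etree.parse() reads a local file. A minimal sketch, with a hypothetical file name and class name:

from lxml import etree

# Parse a saved HTML file instead of a response string.
# "saved_page.html" and the shop-name class are hypothetical.
parser = etree.HTMLParser()
tree = etree.parse("saved_page.html", parser)
shop_names = tree.xpath('//div[@class="shop-name"]/text()')
print(shop_names)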

Session-based login

# Fetch the logged-in bookshelf contents from the 17k novel site.
# A Session automatically carries the login Cookie on later requests,
# so content behind the login can be fetched directly.

import requests
import WebConstant

login_url = 'https://passport.17k.com/ck/user/login'  # login endpoint
collected_books_url = 'https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919'  # interface returning the bookshelf as JSON

session = requests.session()

headers = {
    WebConstant.USER_AGNET_KEY: WebConstant.USER_AGNET
}

params = {
    'loginName': 'BookLoverTempest',
    'password': 'tempest&2001618'
}

session.post(login_url, params=params, headers=headers)

collected_books_dic = session.get(collected_books_url).json()  # parse the JSON response into a dict

for collected_book in collected_books_dic['data']:
    print(collected_book)
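An alternative when you do not want to post the credentials at all is to copy the logged-in Cookie from the browser's developer tools and send it by hand; a minimal sketch (the cookie value below is only a placeholder):

import requests
import WebConstant

# Paste the Cookie copied from the browser after logging in manually.
# The value below is a placeholder, not a real token.
headers = {
    WebConstant.USER_AGNET_KEY: WebConstant.USER_AGNET,
    'Cookie': '<paste the logged-in cookie here>'
}

collected_books_dic = requests.get(
    'https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919',
    headers=headers
).json()

for collected_book in collected_books_dic['data']:
    print(collected_book)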

Anti-hotlink (Referer) handling

Some video URLs are not exposed directly in the returned JSON; the page's JavaScript often rewrites them before playback. The trick is to compare the JSON fields with the real playback address, work out the transformation, and rebuild the URL yourself, while sending a Referer header so the anti-hotlink check passes. A hedged sketch of the idea follows.
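A minimal sketch modelled on a typical video status interface; the page URL, the interface URL, the JSON field names (srcUrl, systemTime) and the rewrite rule are all assumptions for illustration, not a real API:

import requests
import WebConstant

# Hypothetical example: the status interface returns a "fake" URL containing a
# timestamp, and the real address is rebuilt by swapping that timestamp for the
# content id.
page_url = 'https://www.example.com/video_12345'
api_url = 'https://www.example.com/videoStatus.jsp?contId=12345'

headers = {
    WebConstant.USER_AGNET_KEY: WebConstant.USER_AGNET,
    # The anti-hotlink check: the server only answers if the request claims
    # to come from the video's own page.
    'Referer': page_url
}

data = requests.get(api_url, headers=headers).json()
fake_url = data['videoInfo']['videos']['srcUrl']               # hypothetical JSON path
real_url = fake_url.replace(data['systemTime'], 'cont-12345')  # rebuild the real address
print(real_url)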

Multithreading / multiprocessing

# Scrape price data from the Beijing Xinfadi wholesale market.
# A process pool is used here to parallelise the requests (for crawlers a thread
# pool or asyncio is usually the better fit; a thread-pool sketch follows this script).
import time

import requests
import WebConstant
from concurrent.futures import ProcessPoolExecutor

url = "http://www.xinfadi.com.cn/getPriceData.html"

params = {
    'current': 1
}

headers = {
    WebConstant.USER_AGNET_KEY: WebConstant.USER_AGNET
}


def getItemPrice(current: int) -> list:
    params['current'] = current
    with requests.get(url, params=params, headers=headers) as response:
        return response.json()['list']


if __name__ == '__main__':
    items = []
    executors = []
    start = time.time()
    with ProcessPoolExecutor(12) as p:  # a pool of 12 worker processes, roughly 6 s in total
        for i in range(101):
            executors.append(p.submit(getItemPrice, (i + 1)))

        for executor in executors:  # wait for the submitted jobs to finish, then collect the results
            items.append(executor.result().copy())

    # for i in range(101):  # sequential version for comparison, roughly 44 s
    #     getItemPrice(i + 1)

    print(items)
    print(len(items))
    print(time.time() - start)
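For comparison, the thread-pool version mentioned in the comment only needs the ProcessPoolExecutor block replaced; a sketch reusing getItemPrice, params and headers from the script above:

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    items = []
    start = time.time()
    # Threads share one process, so there is no pickling or start-up cost,
    # and for I/O-bound requests they overlap just as well as processes.
    with ThreadPoolExecutor(32) as t:
        futures = [t.submit(getItemPrice, i + 1) for i in range(101)]
        for future in futures:
            items.append(future.result().copy())
    print(len(items))
    print(time.time() - start)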

Asyncio asynchronous programming

# Use aiohttp (asynchronous HTTP requests) to download the web novel Battle Through the Heavens (斗破苍穹).

import os
import asyncio
import aiohttp
import re
import requests
import WebConstant
from lxml import etree


class Script:
    url = 'https://www.ddyueshu.com'

    headers = {
        WebConstant.USER_AGNET_KEY: WebConstant.USER_AGNET
    }

    def __init__(self):
        os.makedirs("./斗破苍穹", exist_ok=True)  # make sure the output directory exists
        chapters = self.getChapters()
        asyncio.run(self.toDownload(chapters))  # run the async downloads inside one event loop

    def getChapters(self) -> list:
        # Fetch the table-of-contents page synchronously and pull out each
        # chapter's relative URL and title with a named-group regex.
        chapters = []

        response = requests.get(Script.url + '/1_1600', headers=Script.headers)
        response.encoding = "gbk"
        pattern = r'.*?<dd><a href ="(?P<url>.*?)">(?P<name>.*?)</a></dd>.*?'
        regex = re.compile(pattern, re.S)
        groups = regex.finditer(response.text)

        for match in groups:
            chapter = {'name': match.group("name"), 'url': match.group("url")}
            chapters.append(chapter)

        return chapters

    async def toDownload(self, chapters: list):
        # One task per chapter; asyncio.wait blocks until all of them finish.
        tasks = []
        for chapter in chapters:
            task = asyncio.create_task(self.download(chapter['url'], chapter['name']))
            tasks.append(task)

        await asyncio.wait(tasks)

    async def download(self, url: str, chapterName: str):
        url = Script.url + url

        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=Script.headers) as response:
                content = await response.text(encoding='gbk')
                html = etree.HTML(content)
                text = html.xpath('//div[@id="content"]/text()')
                with open(f"./斗破苍穹/{chapterName}.txt", "w", encoding='utf-8') as f:
                    f.write("\n".join(text))  # join the text nodes instead of dumping the list's repr


if __name__ == '__main__':
    Script()
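One refinement worth noting: the script above creates one task per chapter all at once, which can hammer the site. A hedged sketch of a concurrency-limited variant, using asyncio.Semaphore and a single shared ClientSession (the limit of 10 is arbitrary):

import asyncio
import aiohttp


async def fetch_all(urls: list) -> list:
    semaphore = asyncio.Semaphore(10)  # at most 10 requests in flight at once

    async def fetch(session: aiohttp.ClientSession, url: str) -> str:
        async with semaphore:  # released automatically once the request completes
            async with session.get(url) as response:
                return await response.text(encoding='gbk')  # gbk, as in the script above

    async with aiohttp.ClientSession() as session:  # one shared session instead of one per chapter
        return await asyncio.gather(*(fetch(session, url) for url in urls))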

Using Selenium

import time

from selenium.webdriver import Edge
from selenium.webdriver.common.keys import Keys

browser = Edge()  # create the browser (driver) object
browser.get("https://www.lagou.com/")  # open the target site

time.sleep(1)
browser.find_element_by_xpath("/html/body/div[10]/div[1]/div[2]/div[2]/div[1]/div/p[1]/a").click()  # locate the element and click it

time.sleep(1)
# send_keys types the search term and presses Enter
browser.find_element_by_xpath("/html/body/div[7]/div[1]/div[1]/div[1]/form/input[1]").send_keys("Java", Keys.ENTER)

time.sleep(1)
# locate every search result on the page
elements = browser.find_elements_by_xpath('//*[@id="s_position_list"]/ul/li')

# open each result (each click opens a new tab)
for element in elements:
    element.find_element_by_xpath('./div[1]/div[1]/div[1]/a').click()


time.sleep(2)
# read the job description from each opened tab, then close that tab
for element in elements:
    browser.switch_to.window(browser.window_handles[-1])
    print(browser.find_element_by_xpath("/html/body/div[1]/div[1]/div[4]/div[1]/dl[1]/dd[2]/div").text)
    browser.close()

time.sleep(3)
browser.switch_to.window(browser.window_handles[0])  # return to the original tab before closing it
browser.close()
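The find_element_by_* helpers used above belong to Selenium 3 and were removed in Selenium 4; the modern equivalent goes through By locators. A minimal sketch (the XPath expressions are only illustrative):

from selenium.webdriver import Edge
from selenium.webdriver.common.by import By

browser = Edge()
browser.get("https://www.lagou.com/")

# Selenium 4 style: a single find_element / find_elements method plus a By locator.
link = browser.find_element(By.XPATH, '//p[1]/a')
results = browser.find_elements(By.XPATH, '//*[@id="s_position_list"]/ul/li')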