1、requests基本使用
1.1、requests介绍
requests是python中一个常用于发送http请求的第三方库,它极大地简化了web服务交互的过程。它是唯一的一个非转基因的python http库,人类可以安全享用。
1.2、requests库的安装
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple requests
1.3、requests基础语法
import requests

url = 'http://www.baidu.com'
# Issue a GET request; the returned object is bound to `response`.
response = requests.get(url)
1.4、response的属性以及类型
(1)一个类型:
print(type(response)) # <class 'requests.models.Response'>
(2)六个属性:
# Set the encoding used to decode the response body.
response.encoding = 'utf-8'
# Page source as a decoded string.
print(response.text)
# URL of the request.
print(response.url)
# Response body as raw bytes.
print(response.content)
# HTTP status code.
print(response.status_code)
# Response headers.
print(response.headers)
2、requests的get请求
爬取郑州页面信息,和urllib基本差不多,只要明白urllib,相信requests的get请求也不会有什么难度。
import requests

# Baidu search endpoint; the query string is supplied through `params`.
url = 'https://www.baidu.com/s?'

# Browser-like User-Agent header sent with the request.
headers = {
    "user-agent": "mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/128.0.0.0 safari/537.36",
}

# Query parameters; requests URL-encodes them, so no manual urlencode is needed.
data = {
    "wd": "郑州",
}

# url: resource path, params: query parameters (dict), headers: request headers.
# The timeout keeps the script from hanging forever if the server never answers.
response = requests.get(url=url, params=data, headers=headers, timeout=10)
content = response.text
print(content)
与urllib的get请求区别:
1、参数需要使用params传递
2、参数无需urlencode
3、不需要请求对象的定制
4、请求资源路径中的?可以省略
3、requests的post请求
我们还是以之前urllib中关于post请求-百度翻译为例:
import json
import os

import requests

# Baidu Translate suggestion endpoint.
url = "https://fanyi.baidu.com/sug"

headers = {
    "user-agent": "mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/128.0.0.0 safari/537.36",
}

# Browser cookies carry login session tokens, so they are read from the
# BAIDU_COOKIE environment variable rather than hard-coded in the script.
# The header is omitted when the variable is not set.
cookie = os.environ.get("BAIDU_COOKIE")
if cookie:
    headers["cookie"] = cookie

# Form fields for the POST body; requests encodes them, no manual encoding needed.
data = {
    "kw": "eye",
}

# The timeout keeps the script from hanging forever if the server never answers.
response = requests.post(url=url, headers=headers, data=data, timeout=10)
content = response.text
# Parse the response body as JSON into Python objects.
content = json.loads(content)
print(content)
与urllib的post请求的区别:
1、post请求不需要编解码
2、post请求的参数是data
3、不需要请求对象的定制
4、代理
import requests

url = "http://www.baidu.com/s?"

headers = {
    "user-agent": "mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/128.0.0.0 safari/537.36",
}

# Search keyword sent as the query string.
data = {
    "wd": "ip",
}

# Proxy mapping: requests made over the "http" scheme go through this proxy.
proxy = {
    "http": "23.247.137.142:80",
}

# The timeout keeps the script from hanging forever on a dead proxy or server.
response = requests.get(url=url, params=data, headers=headers, proxies=proxy, timeout=10)
content = response.text

# Write the returned page to ip.html.
file = open("ip.html", "w", encoding="utf-8")
file.write(content)
file.close()
5、cookie登录
我们以古诗文个人主页页面为例子,含有验证码。

首先我们进入登陆界面后,随便输入密码,然后打开开发者模式,看到login接口,看负载(payload)里面有许多信息。
__VIEWSTATE: mntnh2sbi9ishx8zdfu1nvmbyzxosvf8vxj5qiej5c8emgwhabfqrnjqyme47e+qoo+ss1lsdndjyenry/bdvd7wktgbmm73cku21k7nhlmyo79cc54kuz//cz9kslkkfvkpppzossnyet3gx789uh1dmum=
__VIEWSTATEGENERATOR: c93be1ae
这两个信息不固定,是变量,而code也是变量。因此解决这三个变量就是这个例子的难点
难点:(1)__VIEWSTATE __VIEWSTATEGENERATOR

我们回到登陆页面,检查源代码,发现里面是有这两个变量的。而hidden我们称之为隐藏域。
获取登录页面源码:
import requests

# Login page URL; the `from` query parameter names the collection page.
url = "https://www.gushiwen.cn/user/login.aspx?from=http://www.gushiwen.cn/user/collect.aspx"

headers = {
    "user-agent": "mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/128.0.0.0 safari/537.36",
}

# The timeout keeps the script from hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=10)
content = response.text
解析__VIEWSTATE __VIEWSTATEGENERATOR两个变量的value,可以通过beautifulsoup语法,也可以通过xpath:
from lxml import etree

# lxml's HTML string parser is etree.HTML (upper-case); etree.html does not exist.
tree = etree.HTML(content)
# ASP.NET emits these hidden inputs with upper-case names, and XPath string
# comparison is case-sensitive. Each query returns a list of matching values.
__viewstate = tree.xpath('//input[@name="__VIEWSTATE"]/@value')
__viewstategenerator = tree.xpath('//input[@name="__VIEWSTATEGENERATOR"]/@value')
print(__viewstate)
print(__viewstategenerator)
难点:(2)code验证码(获取验证码图片)
# Relative URL of the captcha image. XPath matching is case-sensitive, so the
# id must match the page's markup exactly.
# NOTE(review): "imgCode" is the expected casing — confirm against the live page.
code = tree.xpath('//img[@id="imgCode"]/@src')[0]
code_url = "https://so.gushiwen.cn"+code
获取了验证码图片后下载到本地观察验证码,然后在控制台输入即可!(当然也可以用pytesseract来识别数字)
import urllib.request

# Download the captcha image at code_url and save it locally as code.jpg.
urllib.request.urlretrieve(code_url, "code.jpg")
code_name = input("请输入验证码:")
但这种方法显然是有问题的,只有我们输入验证码后才会生成新的验证码,也就是说这个时候我们输入的验证码是旧的验证码。因此我们可以用requests库中的session方法,通过session的返回值,使请求变成一个对象。
# Requests made through the same session object share its cookies.
session = requests.session()
response_code = session.get(code_url, timeout=10)
# Use the raw bytes because the payload is an image.
content_code = response_code.content
# "wb" writes binary data; the with-block closes the file even if the write fails.
with open("code.jpg", "wb") as f:
    f.write(content_code)
code_name = input("请输入验证码:")
抓取登录按钮的接口
import getpass
import os

url_post = "https://www.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fwww.gushiwen.cn%2fuser%2fcollect.aspx"

# Account credentials are read from the environment, falling back to an
# interactive prompt, so that no real account or password lives in the script.
email = os.environ.get("GUSHIWEN_EMAIL") or input("请输入账号:")
pwd = os.environ.get("GUSHIWEN_PWD") or getpass.getpass("请输入密码:")

# Form fields of the login request. The two hidden ASP.NET fields use
# upper-case names; each xpath() result is a list, so take the first item.
data_post = {
    "__VIEWSTATE": __viewstate[0],
    "__VIEWSTATEGENERATOR": __viewstategenerator[0],
    "from": "http://www.gushiwen.cn/user/collect.aspx",
    "email": email,
    "pwd": pwd,
    "code": code_name,
    "denglu": "登录",
}

# Post through the same session that downloaded the captcha image.
response_post = session.post(url=url_post, headers=headers, data=data_post, timeout=10)
content_post = response_post.text
f = open("古诗文.html", "w", encoding="utf-8")
f.write(content_post)
完整代码如下:
import getpass
import os

import requests
from lxml import etree

# Step 1: fetch the login page, which carries the hidden form fields and the
# captcha image URL.
url = "https://www.gushiwen.cn/user/login.aspx?from=http://www.gushiwen.cn/user/collect.aspx"
headers = {
    "user-agent": "mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/128.0.0.0 safari/537.36",
}
response = requests.get(url, headers=headers, timeout=10)
content = response.text

# Step 2: extract the hidden fields. lxml's HTML string parser is etree.HTML,
# and XPath matching is case-sensitive, so the ASP.NET field names are
# written in upper case.
tree = etree.HTML(content)
viewstate = tree.xpath('//input[@name="__VIEWSTATE"]/@value')[0]
viewstategenerator = tree.xpath('//input[@name="__VIEWSTATEGENERATOR"]/@value')[0]

# Step 3: build the captcha image URL from its relative src attribute.
# NOTE(review): "imgCode" is the expected id casing — confirm against the live page.
code = tree.xpath('//img[@id="imgCode"]/@src')[0]
code_url = "https://so.gushiwen.cn" + code

# Step 4: download the captcha through a session so that the login POST below
# shares the same cookies.
session = requests.session()
response_code = session.get(code_url, timeout=10)
# Use the raw bytes because the payload is an image.
content_code = response_code.content
with open("code.jpg", "wb") as f:
    f.write(content_code)

# Step 5: the user reads code.jpg and types the captcha in.
code_name = input("请输入验证码:")

# Step 6: submit the login form. Credentials are read from the environment,
# falling back to an interactive prompt, so that no real account or password
# lives in the script.
url_post = "https://www.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fwww.gushiwen.cn%2fuser%2fcollect.aspx"
email = os.environ.get("GUSHIWEN_EMAIL") or input("请输入账号:")
pwd = os.environ.get("GUSHIWEN_PWD") or getpass.getpass("请输入密码:")
data_post = {
    "__VIEWSTATE": viewstate,
    "__VIEWSTATEGENERATOR": viewstategenerator,
    "from": "http://www.gushiwen.cn/user/collect.aspx",
    "email": email,
    "pwd": pwd,
    "code": code_name,
    "denglu": "登录",
}
response_post = session.post(url=url_post, headers=headers, data=data_post, timeout=10)
content_post = response_post.text

# Step 7: save the returned page; the with-block closes the file.
with open("古诗文.html", "w", encoding="utf-8") as f:
    f.write(content_post)
发表评论