Python 3: simulate a login with cookies and scrape the data in the tables
Requirements: 1) The web page contains more than 300 tables to crawl. 2) Store the data in Excel: the first row is the table name, followed by the scraped data, with a blank row added at the end; all of the scraped data goes into the same workbook.
Approach: the login is simple and has no CAPTCHA, so first log in with the username and password, obtain the cookie and save it locally, then send that cookie along when visiting the target site.
LOGIN_URL: the login site. get_url: the target site to crawl.
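Because the cookie is saved to a local file, a later run can skip the login step entirely. A minimal sketch of reloading that file, assuming the same cookie_jar.txt produced by the full code below and a placeholder target URL:

import http.cookiejar
import urllib.request

# Reload the cookies saved by the login step
cookie_jar = http.cookiejar.MozillaCookieJar()
cookie_jar.load('cookie_jar.txt', ignore_discard=True, ignore_expires=True)
# Requests made through this opener carry the saved login cookie
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
response = opener.open('http://www.xxx.com?tablename=some_table')  # placeholder URL
print(response.read().decode())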
The table data on the web page looks like this:
table.txt:
The resulting data looks like this: the first row is the table name, followed by the scraped data, with a blank row at the end.
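Schematically, each table takes this shape in the output file (the values shown here are made up):

table_name_1
v1,v2,v3,v4,v5,v6
v1,v2,v3,v4,v5,v6
(blank row)
table_name_2
...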
Complete code:
# -*- coding: utf-8 -*-
# python3
from bs4 import BeautifulSoup
import csv
import urllib.request
import urllib.parse
import urllib.error
import http.cookiejar
import time

userAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
headers = {
    # "Origin": "http://www.xx.com",
    "Referer": "http://www.xxx.com",
    "User-Agent": userAgent,
}
# Log in, save the cookie, then crawl each table
def check_link():
    # URL of the login page (the academic affairs system)
    LOGIN_URL = r'http://www.xxx.com'
    # values = {'IDToken1': '201106******', 'IDToken2': '***********'}
    values = {
        "userCode": "xxxx",  # username
        "password": "xxxx",  # password
    }
    print("start1")
    # Encode the login form for the POST request
    postdata = urllib.parse.urlencode(values).encode()
    # Build an opener that captures the cookie returned by the login
    cookie_filename = 'cookie_jar.txt'
    cookie_jar = http.cookiejar.MozillaCookieJar(cookie_filename)
    handler = urllib.request.HTTPCookieProcessor(cookie_jar)
    opener = urllib.request.build_opener(handler)
    login_request = urllib.request.Request(LOGIN_URL, postdata, headers)
    try:
        response = opener.open(login_request)
        # print(response.read().decode())
    except urllib.error.URLError as e:
        print(e.code, ':', e.reason)
    # Save the cookie to cookie_jar.txt
    cookie_jar.save(ignore_discard=True, ignore_expires=True)
    for item in cookie_jar:
        print('Name = ' + item.name)
        print('Value = ' + item.value)
    # Read the table names from a local file
    tablename = []
    with open("table.txt", "r") as f:
        for line in f.readlines():
            tablename.append(line.strip())
    # print(tablename)
    count = 0  # progress counter
    # Crawl the fields of each table in turn
    for col in tablename:
        time.sleep(4)
        # URL to crawl after logging in; the opener sends the saved cookie along with the request
        get_url = 'http://www.xxx.com?tablename={tablename}'.format(tablename=col)
        get_request = urllib.request.Request(get_url, headers=headers)
        get_response = opener.open(get_request)
        # Log which table is currently being crawled
        print("{count}:{col}".format(count=count, col=col))
        count = count + 1
        # Parse the page and collect the value of every <td> cell
        soup = BeautifulSoup(get_response, 'lxml')
        # For local testing, parse a saved copy of the page instead:
        # soup = BeautifulSoup(open('C:/Users/user/Desktop/table/test.html'), 'lxml')
        trs = soup.find_all('td')
        ulist = []
        for td in trs:
            ulist.append(td.string)
        save_contents(ulist, col)
    # Finished
    print("success!!")
# Save one table's data to the output file
def save_contents(urlist, tablename):
    # print(urlist)
    n = 6  # the table on the page has 6 columns per row
    datas = [urlist[i:i + n] for i in range(0, len(urlist), n)]
    # datas[0] = ["a", "b", "c", "d", "e", "f"]
    # Open in append mode so every table ends up in the same file;
    # utf-8 with BOM lets Excel open the CSV correctly
    with open("C:/Users/user/Desktop/table/gettable.csv", 'a+', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        # First row: the table name
        writer.writerow(["{name}".format(name=tablename)])
        # Then the data, row by row
        for row in datas:
            writer.writerow(row)
        # Finally a blank separator row
        writer.writerow([])
if __name__ == '__main__':
    check_link()
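The requirement above asks for an Excel workbook, while this script appends everything to a single CSV file. A minimal sketch of copying the finished CSV into an .xlsx workbook afterwards, assuming openpyxl is installed and using the same placeholder path as above:

import csv
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
with open("C:/Users/user/Desktop/table/gettable.csv", newline='', encoding='utf-8-sig') as f:
    for row in csv.reader(f):
        ws.append(row)  # table-name rows, data rows and blank rows are copied as-is
wb.save("C:/Users/user/Desktop/table/gettable.xlsx")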