
Python 3: Simulating Login with Cookies and Scraping Table Data

2018-12-28
Hannoch


Requirements: 1) scrape the 300-plus tables on the site; 2) store them in Excel, where the first row of each block is the table name, the scraped data follows, and a blank row closes the block; all tables go into the same workbook.

Approach: the login is simple and has no CAPTCHA. Log in first with a username and password, save the resulting cookie to a local file, then send that cookie along when requesting the target pages.

LOGIN_URL is the login page; get_url is the target page to scrape.
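One note before the code: the script below saves the cookie to cookie_jar.txt but logs in afresh on every run. A minimal sketch, not part of the original script, of how a later run could reuse the saved cookie instead of logging in again:

import http.cookiejar
import urllib.request

# Assumes cookie_jar.txt was written by an earlier successful login
cookie_jar = http.cookiejar.MozillaCookieJar('cookie_jar.txt')
cookie_jar.load(ignore_discard=True, ignore_expires=True)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
# Requests made through this opener now carry the saved session cookie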

The table data on the page (screenshot in the original post, omitted here):
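As a stand-in for the screenshot: the scraper only assumes the data sits in <td> cells, six per row. A runnable illustration with made-up values:

from bs4 import BeautifulSoup

# Hypothetical one-row, six-column table mirroring the page structure
html = "<table><tr><td>a</td><td>b</td><td>c</td><td>d</td><td>e</td><td>f</td></tr></table>"
soup = BeautifulSoup(html, 'lxml')
print([td.string for td in soup.find_all('td')])
# ['a', 'b', 'c', 'd', 'e', 'f']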

table.txt holds the table names to scrape, one per line (screenshot omitted).
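For example, table.txt might look like this (hypothetical names; the real list is only in the screenshot):

table_a
table_b
table_c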

The resulting data (screenshot omitted): the first row of each block is the table name, the scraped rows follow, and a blank row closes the block.
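With the same hypothetical names, a block in the output file looks like:

table_a
a,b,c,d,e,f
1,2,3,4,5,6
(blank row)
table_b
...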

Full code:

# -*- coding: utf-8 -*-
# Python 3

from bs4 import BeautifulSoup
import csv
import http.cookiejar
import time
import urllib.error
import urllib.parse
import urllib.request


userAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
headers = {
    # "Origin": "http://www.xx.com",
    "Referer": "http://www.xxx.com",
    "User-Agent": userAgent,
}

# Log in, save the session cookie, then scrape every table
def check_link():
    # URL of the login page
    LOGIN_URL = r'http://www.xxx.com'
    # values = {'IDToken1': '201106******', 'IDToken2': '***********'}
    values = {
        "userCode": "xxxx",  # username
        "password": "xxxx",  # password
    }
    print("start1")
    # Encode the login form for the POST request
    postdata = urllib.parse.urlencode(values).encode()
    # Cookie jar that persists the session cookie to a local file
    cookie_filename = 'cookie_jar.txt'
    cookie_jar = http.cookiejar.MozillaCookieJar(cookie_filename)
    handler = urllib.request.HTTPCookieProcessor(cookie_jar)
    opener = urllib.request.build_opener(handler)

    login_request = urllib.request.Request(LOGIN_URL, postdata, headers)
    try:
        response = opener.open(login_request)
        # print(response.read().decode())
    except urllib.error.URLError as e:
        # e.code only exists on HTTPError, so print the reason alone
        print('login failed:', e.reason)

    # Save the cookie to cookie_jar.txt
    cookie_jar.save(ignore_discard=True, ignore_expires=True)
    for item in cookie_jar:
        print('Name = ' + item.name)
        print('Value = ' + item.value)

    tablename = []
    # Read the table names, one per line, from a local file
    with open("table.txt", "r") as f:
        for line in f.readlines():
            tablename.append(line.strip())
    # print(tablename)
    count = 0  # progress counter
    # Fetch the fields of each table in turn
    for col in tablename:
        time.sleep(4)  # throttle the requests
        # URL to scrape after login; the opener sends the saved cookie along
        get_url = 'http://www.xxx.com?tablename={tablename}'.format(tablename=col)
        get_request = urllib.request.Request(get_url, headers=headers)
        get_response = opener.open(get_request)
        # Log which table is being scraped
        print("{count}:{col}".format(count=count, col=col))
        count = count + 1
        # Parse the page and collect the text of every <td> cell
        soup = BeautifulSoup(get_response, 'lxml')
        # For offline testing, parse a locally saved copy of the page instead:
        # soup = BeautifulSoup(open('C:/Users/user/Desktop/table/test.html'), 'lxml')
        tds = soup.find_all('td')
        ulist = []
        for td in tds:
            ulist.append(td.string)
        save_contents(ulist, col)
    # Done
    print("success!!")


# Append one table to the CSV file
def save_contents(urlist, tablename):
    # print(urlist)
    n = 6  # the scraped tables have six columns per row
    datas = [urlist[i:i + n] for i in range(0, len(urlist), n)]
    # datas[0] = ["a", "b", "c", "d", "e", "f"]
    # Open in append mode so every table lands in the same file
    with open("C:/Users/user/Desktop/table/gettable.csv", 'a+', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([tablename])  # first row: the table name
        for row in datas:
            writer.writerow(row)  # one data row per line
        writer.writerow([])  # blank row to separate the tables


if __name__ == '__main__':
    check_link()
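The requirement says Excel, and the script writes a CSV, which Excel opens directly. If a real .xlsx workbook is wanted, here is a minimal sketch that converts the finished CSV, assuming the third-party openpyxl package is installed (not part of the original script):

import csv
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
# Copy every CSV row, blank separator rows included, into one worksheet
with open("C:/Users/user/Desktop/table/gettable.csv", newline='') as f:
    for row in csv.reader(f):
        ws.append(row)
wb.save("C:/Users/user/Desktop/table/gettable.xlsx")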

