# -------- Installed Python packages (安装的python包) ---------------
import base64
import json
import os
import re
import time
from functools import reduce

import requests
pwd = base64.b64decode("") #64位解码 loginMeta = { "username":'',"pwd":pwd} #用户名和密码 msession = requests.Session() ret = msession.post("http://uis.shou.edu.cn/cas/login?isLoginService=11&service=http://ecampus.shou.edu.cn/c/portal/login", { "username":loginMeta['username'], "password":loginMeta['pwd'], "submit":"" },allow_redirects = False) ret = msession.get(ret.headers['Location'],allow_redirects = False) Jsession = ret.headers['Set-Cookie'].split(';')[0] ret = msession.get(ret.headers['Location'],allow_redirects = False) Jheaders = { 'Cookie':'COOKIE_SUPPORT=true; JSESSIONID=%s; GUEST_LANGUAGE_ID=zh_CN'%Jsession} getASessionUrl = '' ret = msession.get(getASessionUrl,headers=Jheaders,allow_redirects=False) while 'Location' in ret.headers: ret = msession.get(ret.headers['Location']) fwUrl = "" ret = msession.get(fwUrl,headers=Jheaders,allow_redirects=False) while 'Location' in ret.headers: ret = msession.get(ret.headers['Location']) ACookies = requests.utils.dict_from_cookiejar(msession.cookies) ret = msession.get('') ret = msession.get('') ------------登录部分长久保存cookie-------------------------------------
def parseOrderInfo(content):
    """Parse an OrderShow HTML fragment into a list of order dicts.

    NOTE(review): the regex literals in this function were corrupted when the
    file was flattened (unbalanced quotes made the module unparseable).  The
    patterns below are reconstructed for a conventional <tr>/<td> result
    table — confirm them against a live OrderShow response.

    Parameters:
        content: decoded HTML body returned by the OrderShow endpoint.

    Returns:
        list of dicts with keys "orderId", "project", "reason", "pay", "date".

    Raises:
        Exception: when a table row yields too few cells to unpack.
    """
    # Collapse the markup onto one line so row-level regexes can span it.
    content = content.replace("\r", '').replace('\n', '').replace('\t', ' ')
    # One <tr>...</tr> per order row.
    eles = re.findall(r'<tr.*?>(.*?)</tr>', content)
    # BUG FIX: `orders` was never initialized before the `+=` below.
    orders = []
    for ele in eles:
        # Put each <td> cell's text on its own line.  re.subn returns a
        # (new_string, substitution_count) tuple, hence the [0] accesses.
        p = re.subn(r'<td.*?>(.*?)</td>', "\n\\1", ele)
        p = re.subn(r'&nbsp;', "", p[0])
        p = p[0]
        # Keep only cells with more than one meaningful character.
        p = list(filter(lambda x: x if len(x.strip()) > 1 else None, p.split('\n')))
        print(p)
        if (len(p) > 6):
            # NOTE(review): positional cell indices reconstructed from the
            # original; verify column order against the real table.
            cinfo = {}
            cinfo["orderId"] = p[1].strip()
            cinfo["project"] = p[2].strip()
            cinfo["reason"] = p[3].strip()
            cinfo["pay"] = p[4].strip()
            cinfo["date"] = p[6].strip()
            orders += [cinfo]
        else:
            # BUG FIX: original message said "too LONG" but this branch fires
            # when the row has too FEW usable cells.
            raise Exception("too short order description")
    return orders


# Initial query: fetch page 1 of the user's reimbursement orders.
# All form fields are required — omitting any makes the server stop
# returning results.
ret = msession.post(url='http://cwc1.shou.edu.cn:82/SFP_ClaimsSelf/OrderQuery/OrderShow',
                    data={
                        'DepartProject': '', 'Depart': '', 'depname': '',
                        'Object': '', 'projectname': '', 'OrderStartTime': '',
                        'OrderEndTime': '', 'OrderNo': '',
                        'OrderState': '1,2,3,4,5,8,-1',
                        'ExpenBusinessType': '', 'currentPageIndex': '1',
                        'num': '1', 'isture': 'false', 'ProxyPerson': '',
                        'OrderRemark': ''},
                    headers={
                        'Cookie': 'ASP.NET_SessionId=%s; SFP_Verify_Cookie=%s' % (
                            ACookies["ASP.NET_SessionId"], ACookies["SFP_Verify_Cookie"]),
                        'Referer': 'http://cwc1.shou.edu.cn:82/SFP_ClaimsSelf/OrderQuery/OrderIndex',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                        'X-Requested-With': 'XMLHttpRequest'})
# Give the server a moment before parsing and paging.
time.sleep(2)
seaContent = ret.content.decode()
# Structured order records (list of dicts) parsed from the HTML table.
orderInfo = parseOrderInfo(seaContent)
# Order numbers scraped from the per-order "PrintOrder" links.
orders = re.findall("SFP_ClaimsSelf/OrderQuery/PrintOrder\?OrderNo=(\\d+)", seaContent)
# pages=2
# Total page count as reported by the pager widget embedded in the response.
pages = int(re.findall("pagecount: '(\\d*)'", seaContent)[0])
def _fetch_order_page(page_index, num):
    """POST one page of the OrderShow query.

    Factored out of two copy-pasted request blocks that differed only in
    'currentPageIndex' and 'num'.  `num` mirrors the original payloads
    ('1' for the first page, '2' for the rest — server-side meaning
    unconfirmed).  Returns the requests.Response.
    """
    return msession.post(
        url='http://cwc1.shou.edu.cn:82/SFP_ClaimsSelf/OrderQuery/OrderShow',
        data={
            'DepartProject': '', 'Depart': '', 'depname': '', 'Object': '',
            'projectname': '', 'OrderStartTime': '', 'OrderEndTime': '',
            'OrderNo': '', 'OrderState': '1,2,3,4,5,8,-1',
            'ExpenBusinessType': '', 'currentPageIndex': page_index,
            'num': num, 'isture': 'false', 'ProxyPerson': '',
            'OrderRemark': ''},
        headers={
            'Cookie': 'ASP.NET_SessionId=%s; SFP_Verify_Cookie=%s' % (
                ACookies["ASP.NET_SessionId"], ACookies["SFP_Verify_Cookie"]),
            'Referer': 'http://cwc1.shou.edu.cn:82/SFP_ClaimsSelf/OrderQuery/OrderIndex',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'})


if pages > 1:
    for i in range(1, pages + 1):
        if i == 1:
            # NOTE(review): page 1 was already fetched above; this re-fetch
            # (with num='1') is kept to preserve the original behavior.
            ret = _fetch_order_page('1', '1')
        else:
            ret = _fetch_order_page('%d' % i, '2')
        seaContent = ret.content.decode()
        orders += re.findall("SFP_ClaimsSelf/OrderQuery/PrintOrder\?OrderNo=(\\d+)", seaContent)
        orderInfo += parseOrderInfo(seaContent)
        time.sleep(1)

# orderprint: download the printable PDF for every collected order number.
# NOTE(review): the flattened source made the original indentation ambiguous;
# this loop is placed AFTER the pager loop so every page's orders download
# exactly once — confirm against the intended behavior.
# BUG FIX: removed the unused `Number = int(i)`, which raised NameError
# whenever pages <= 1 (the pager loop never runs, so `i` is undefined).
for orderId in orders:
    Url = 'http://cwc1.shou.edu.cn:82/SFP_ClaimsSelf/OrderQuery/PrintOrder?OrderNo='
    printUrl = Url + orderId
    # print(printUrl)
    result = requests.get(url=printUrl,
                          headers={
                              'Cookie': 'ASP.NET_SessionId=%s; SFP_Verify_Cookie=%s' % (
                                  ACookies["ASP.NET_SessionId"], ACookies["SFP_Verify_Cookie"]),
                              'Referer': ret.url,
                              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                          })
    address = "E:/totally/FinancePDF" + "/"  # PDF output directory
    fileName = orderId + ".pdf"
    if os.path.isfile(address + fileName):
        print(fileName + '文件已存在')  # file already exists; skip download
    else:
        with open(address + orderId + ".pdf", "wb") as f:
            f.write(result.content)
# ------------------ Summary section (提交表单部分: all form fields above are
# required, otherwise the server stops returning results) --------------------

# Dump every parsed order as pretty-printed JSON for inspection.
sumInfo = {"detail": orders}
print(json.dumps(sumInfo, indent=4))

if __name__ == '__main__':
    # BUG FIX: the original guard called parseOrderInfo() with no argument,
    # which always raised TypeError (it requires `content`).  All real work
    # already runs at module level above, so the guard needs no body.
    pass