jake
jake
16天前 · 2 人阅读

这个爬虫主要分两步:先请求页面,再用正则表达式过滤出需要的内容;用到的基础知识是 Python 的 tuple、list、str 等数据类型。

进群:943752371   即可获取数十套PDF!

# -*- coding: utf-8 -*-

# Citizen's salary of 0 Dn

# Clerk's salary of 2 Dn

# Engineer's salary of 5 Dn

# Architect's salary of 8 Dn

# Quaestor's salary of 12 Dn

# Procurator's salary of 20 Dn

# Aedile's salary of 30 Dn

# Praetor's salary of 40 Dn

# Consul's salary of 60 Dn

# Proconsul's salary of 80 Dn

# Caesar's salary of 100 Dn

# Python Scraper

from pprint import pprint

from inspect import getmembers

import urllib.request

import json

import re

import os

# open the url and read

def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address handed unchanged to ``urllib.request.urlopen``.

    Returns:
        The undecoded body as ``bytes``; callers pick the charset themselves.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails (19-second timeout).
    """
    # The context manager closes the response even if read() raises; the
    # previous explicit close() was skipped on any read error.
    with urllib.request.urlopen(url, None, timeout=19) as page:
        return page.read()

def getFile(url):
    """Download *url* into the current working directory.

    The local file name is the text after the last ``/`` in *url*. The body
    is streamed in 8 KiB chunks so large files are never held in memory.

    Args:
        url: Address handed unchanged to ``urllib.request.urlopen``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails (19-second timeout).
        OSError: If the local file cannot be created or written.
    """
    file_name = url.split('/')[-1]
    block_sz = 8192

    # Open the remote side first so a failed request does not leave an empty
    # local file behind. Both handles are released by the ``with`` block even
    # when a read or write fails (previously the response was never closed
    # and the file leaked on error).
    with urllib.request.urlopen(url, None, timeout=19) as u, \
            open(file_name, 'wb') as f:
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)

    print("Sucessful to download" + " " + file_name)

# compile the regular expressions and find

# all stuff we need

def getUrl(html):
    """Return every PDF link target found in *html*.

    Args:
        html: Page body as ``bytes``; decoded as GB2312 before matching.

    Returns:
        A list of href values (absolute or relative) that end in ``.pdf``,
        in document order.

    Raises:
        UnicodeDecodeError: If *html* is not valid GB2312.
    """
    # Two fixes over the previous pattern:
    #  * the dot before "pdf" is escaped, so "notapdf" no longer matches;
    #  * the target may not contain quotes, whitespace or '>', so a match can
    #    no longer start at a non-PDF href and run on into a later PDF link.
    reg = r'(?:href|HREF)="?([^"\'\s>]+?\.pdf)'
    url_re = re.compile(reg)
    url_lst = url_re.findall(html.decode('gb2312'))
    return url_lst

def getTS(html):
    """Return every ``<digits>.ts`` token found in *html*.

    Args:
        html: Page or playlist body as ``bytes``; decoded as GB2312.

    Returns:
        A list of strings such as ``'001.ts'``, in document order.

    Raises:
        UnicodeDecodeError: If *html* is not valid GB2312.
    """
    # The escapes were missing: 'd+.ts' matched runs of the letter "d"
    # followed by any character, not digits followed by a literal dot.
    # NOTE(review): presumably the backslashes were lost when the listing
    # was published - confirm digits + ".ts" is the intended token.
    reg = re.compile(r'(\d+\.ts)')
    items = reg.findall(html.decode('gb2312'))
    return items

def getAnchors(html):
    """Extract kuaidi100 courier links from an index page.

    Args:
        html: Page body as ``bytes``; decoded as UTF-8 before matching.

    Returns:
        A list of ``(url, section, text)`` tuples, where *url* is the full
        href, *section* is ``'global'`` or ``'all'`` (the first path
        segment), and *text* is the anchor's link text.

    Raises:
        UnicodeDecodeError: If *html* is not valid UTF-8.
    """
    # Changes from the previous pattern:
    #  * dots in the host name are escaped so only the real domain matches;
    #  * the quantified zero-width lookahead between the closing quote and
    #    '>' is replaced by '[^>]*', so anchors carrying further attributes
    #    (class, target, ...) are accepted as well;
    #  * the outer lookahead wrapper is dropped - the captured groups are
    #    the same without it.
    reg = re.compile(
        r'href="(https://www\.kuaidi100\.com/(global|all)/[^"]+)"[^>]*>([^<]+)<'
    )
    items = reg.findall(html.decode('utf8'))
    return items

def _sql_tuple(fields):
    """Render *fields* as ``('a', 'b', ...)`` with single quotes doubled."""
    quoted = ("'" + str(field).replace("'", "''") + "'" for field in fields)
    return "(" + ", ".join(quoted) + ")"


def getExpressSql(items):
    """Build one INSERT statement per courier detail page.

    Each item's first element is fetched with ``getHtml``, decoded as UTF-8
    and scanned for the courier fields; the captured values become the VALUES
    list of an ``insert into wxapp_express`` statement.

    Args:
        items: Iterable of tuples whose first element is a detail-page URL
            (the shape produced by ``getAnchors``).

    Returns:
        A list of SQL strings, one per page on which the pattern matched.
        Each statement is also printed as it is produced.
    """
    # NOTE(review): the pattern in the published listing was cut off after
    # the third capture (an unterminated string literal), so the code could
    # not run. The first three captures below (company code, service phone,
    # display name) come from the visible part. The fourth is a
    # reconstruction: the sample output has a logo URL as its fourth column,
    # so the next src="..." attribute is captured - confirm against the
    # live page markup.
    detail_re = re.compile(
        r'companyCode.+?value="(.+?)"'
        r'.+?客服电话">([^<]*)<'
        r'.+?selectComBtn".+?title="(.+?)"'
        r'.+?src="(.+?)"'
    )

    sqls = []
    for item in items:
        # pprint (item[1]+"=>"+item[0])
        page = getHtml(item[0]).decode('utf8')

        # Collapse the page to one line so '.' in the pattern can span what
        # were separate lines. The listing showed replace("", ""), a no-op;
        # NOTE(review): presumably "\n" / "\r" with the backslashes lost.
        page = page.replace("\n", "").replace("\r", "")

        rows = detail_re.findall(page)
        if not rows:
            # Nothing captured: skip instead of emitting a statement with an
            # empty VALUES list, which is not valid SQL.
            continue

        # Quote each value explicitly rather than relying on tuple repr(),
        # which switches to double quotes for any value containing an
        # apostrophe; separate multiple rows with commas.
        vals = ", ".join(_sql_tuple(row) for row in rows)
        sql = "insert into wxapp_express values " + vals
        sqls.append(sql)
        print(sql)

    return sqls

def express_scraper(url='https://www.kuaidi100.com/all/'):
    """Scrape a kuaidi100 index page and print the generated SQL.

    Fetches *url*, extracts the courier anchors, pretty-prints them, then
    builds the INSERT statements from each courier's detail page and
    pretty-prints those too.

    Args:
        url: Index page to scrape. Defaults to the ``/all/`` listing, which
            is the one the original code ended up using; the ``/global/``
            listing (``'https://www.kuaidi100.com/global/'``) was assigned
            first and immediately overwritten, so pass it explicitly to
            scrape that page instead.
    """
    html = getHtml(url)

    anchors = getAnchors(html)
    pprint(anchors)

    sqls = getExpressSql(anchors)
    pprint(sqls)


# Only scrape when run as a script, so importing this module does not
# trigger network requests.
if __name__ == "__main__":
    express_scraper()

抓取的是快递公司信息,并生成sql语句:

insert into wxapp_express values ('bangladesh', '', '孟加拉国(EMS)', 'https://cdn.kuaidi100.com/images/all/56/bangladesh.png')

insert into wxapp_express values ('canpost', '', '加拿大(Canada Post)', 'https://cdn.kuaidi100.com/images/all/56/canpost.png')

insert into wxapp_express values ('ceskaposta', '', '捷克(Česká pošta)', 'https://cdn.kuaidi100.com/images/all/56/ceskaposta.png')

insert into wxapp_express values ('chronopostfren', '', '法国大包、EMS-英文(Chronopost France)', 'https://cdn.kuaidi100.com/images/all/56/chronopostfren.png')

insert into wxapp_express values ('correosdees', '', '西班牙(Correos de España)', 'https://cdn.kuaidi100.com/images/all/56/correosdees.png')

insert into wxapp_express values ('cypruspost', '', '塞浦路斯(Cyprus Post)', 'https://cdn.kuaidi100.com/images/all/56/cypruspost.png')

insert into wxapp_express values ('dpexen', '', 'Toll', 'https://cdn.kuaidi100.com/images/all/56/dpexen.png')

insert into wxapp_express values ('eltahell', '', '希腊EMS(ELTA Courier)', 'https://cdn.kuaidi100.com/images/all/56/eltahell.png')

insert into wxapp_express values ('emsukraine', '', '乌克兰EMS(EMS Ukraine)', 'https://cdn.kuaidi100.com/images/all/56/emsukraine.png')

insert into wxapp_express values ('fardarww', '', '颿达国际快递', 'https://cdn.kuaidi100.com/images/all/56/fardarww.png')

insert into wxapp_express values ('fastway', '', 'Fastway Ireland', 'https://cdn.kuaidi100.com/images/all/56/fastway.png')

insert into wxapp_express values ('fedexuk', '', 'FedEx-英国件(FedEx UK)', 'https://cdn.kuaidi100.com/images/all/56/fedexuk.png')

insert into wxapp_express values ('gatikwe', '', 'Gati-KWE', 'https://cdn.kuaidi100.com/images/all/56/gatikwe.png')

insert into wxapp_express values ('hkpost', '', '中国香港(HongKong Post)', 'https://cdn.kuaidi100.com/images/all/56/hkpost.png')

insert into wxapp_express values ('hrvatska', '', '克罗地亚(Hrvatska Posta)', 'https://cdn.kuaidi100.com/images/all/56/hrvatska.png')

insert into wxapp_express values ('interlink', '', 'Interlink Express', 'https://cdn.kuaidi100.com/images/all/56/interlink.png')

insert into wxapp_express values ('iparcel', '', 'UPS i-parcel', 'https://cdn.kuaidi100.com/images/all/56/iparcel.png')

insert into wxapp_express values ('italysad', '', 'Italy SDA', 'https://cdn.kuaidi100.com/images/all/56/italysad.png')

insert into wxapp_express values ('japanposten', '', '日本(Japan Post)', 'https://cdn.kuaidi100.com/images/all/56/japanposten.png')

insert into wxapp_express values ('kcs', '', 'KCS', 'https://cdn.kuaidi100.com/images/all/56/kcs.png')

insert into wxapp_express values ('koreapost', '', '韩国(Korea Post)', 'https://cdn.kuaidi100.com/images/all/56/koreapost.png')

insert into wxapp_express values ('lasership', '', 'LaserShip', 'https://cdn.kuaidi100.com/images/all/56/lasership.png')

insert into wxapp_express values ('latvia', '', '拉脱维亚(Latvijas Pasts)', 'https://cdn.kuaidi100.com/images/all/56/latvia.png')

insert into wxapp_express values ('lianbangkuaidi', '', '联邦快递', 'https://cdn.kuaidi100.com/images/all/56/lianbangkuaidi.png')

insert into wxapp_express values ('libanpost', '', '黎巴嫩(Liban Post)', 'https://cdn.kuaidi100.com/images/all/56/libanpost.png')

insert into wxapp_express values ('lithuania', '', '立陶宛(Lietuvos paštas)', 'https://cdn.kuaidi100.com/images/all/56/lithuania.png')

insert into wxapp_express values ('macedonia', '', '马其顿(Macedonian Post)', 'https://cdn.kuaidi100.com/images/all/56/macedonia.png')

insert into wxapp_express values ('maldives', '', '马尔代夫(Maldives Post)', 'https://cdn.kuaidi100.com/images/all/56/maldives.png')

insert into wxapp_express values ('malta', '', '马耳他(Malta Post)', 'https://cdn.kuaidi100.com/images/all/56/malta.png')

insert into wxapp_express values ('mexico', '', '墨西哥(Correos de Mexico)', 'https://cdn.kuaidi100.com/images/all/56/mexico.png')

insert into wxapp_express values ('mexicodenda', '', 'Mexico Senda Express', 'https://cdn.kuaidi100.com/images/all/56/mexicodenda.png')

insert into wxapp_express values ('moldova', '', '摩尔多瓦(Posta Moldovei)', 'https://cdn.kuaidi100.com/images/all/56/moldova.png')

insert into wxapp_express values ('mrw', '', 'MRW', 'https://cdn.kuaidi100.com/images/all/56/mrw.png')

insert into wxapp_express values ('multipack', '', 'Mexico Multipack', 'https://cdn.kuaidi100.com/images/all/56/multipack.png')

insert into wxapp_express values ('myhermes', '', 'MyHermes', 'https://cdn.kuaidi100.com/images/all/56/myhermes.png')

insert into wxapp_express values ('nigerianpost', '', '尼日利亚(Nigerian Postal)', 'https://cdn.kuaidi100.com/images/all/56/nigerianpost.png')

insert into wxapp_express values ('novaposhta', '', 'Nova Poshta', 'https://cdn.kuaidi100.com/images/all/56/novaposhta.png')

insert into wxapp_express values ('ocaargen', '', 'OCA Argentina', 'https://cdn.kuaidi100.com/images/all/56/ocaargen.png')

insert into wxapp_express values ('ontrac', '', 'OnTrac', 'https://cdn.kuaidi100.com/images/all/56/ontrac.png')

insert into wxapp_express values ('opek', '', 'OPEK', 'https://cdn.kuaidi100.com/images/all/56/opek.png')

insert into wxapp_express values ('portugalctt', '', '葡萄牙(Portugal CTT)', 'https://cdn.kuaidi100.com/images/all/56/portugalctt.png')

insert into wxapp_express values ('portugalseur', '', 'Portugal Seur', 'https://cdn.kuaidi100.com/images/all/56/portugalseur.png')

insert into wxapp_express values ('postenab', '', 'PostNord(Posten AB)', 'https://cdn.kuaidi100.com/images/all/56/postenab.png')

insert into wxapp_express values ('postennorge', '', '挪威(Posten Norge)', 'https://cdn.kuaidi100.com/images/all/56/postennorge.png')

insert into wxapp_express values ('postnl', '', '荷兰邮政(PostNL international registered mail)', 'https://cdn.kuaidi100.com/images/all/56/postnl.png')

insert into wxapp_express values ('purolator', '', 'Purolator', 'https://cdn.kuaidi100.com/images/all/56/purolator.png')

insert into wxapp_express values ('redexpress', '', 'Red Express', 'https://cdn.kuaidi100.com/images/all/56/redexpress.png')

insert into wxapp_express values ('romanian', '', '罗马尼亚(Posta Romanian)', 'https://cdn.kuaidi100.com/images/all/56/romanian.png')

insert into wxapp_express values ('ruidianyouzheng', '', '瑞典(Sweden Post)', 'https://cdn.kuaidi100.com/images/all/56/ruidianyouzheng.png')

insert into wxapp_express values ('safexpress', '', 'Safexpress', 'https://cdn.kuaidi100.com/images/all/56/safexpress.png')

insert into wxapp_express values ('saudipost', '', '沙特阿拉伯(Saudi Post)', 'https://cdn.kuaidi100.com/images/all/56/saudipost.png')

insert into wxapp_express values ('selektvracht', '', 'Selektvracht', 'https://cdn.kuaidi100.com/images/all/56/selektvracht.png')

insert into wxapp_express values ('serbia', '', '塞尔维亚(PE Post of Serbia)', 'https://cdn.kuaidi100.com/images/all/56/serbia.png')

insert into wxapp_express values ('seur', '', 'International Seur', 'https://cdn.kuaidi100.com/images/all/56/seur.png')

insert into wxapp_express values ('singpost', '', '新加坡小包(Singapore Post)', 'https://cdn.kuaidi100.com/images/all/56/singpost.png')

insert into wxapp_express values ('sinoex', '', '中外运速递-中文', 'https://cdn.kuaidi100.com/images/all/56/sinoex.png')

insert into wxapp_express values ('siodemka', '', 'Siodemka', 'https://cdn.kuaidi100.com/images/all/56/siodemka.png')

insert into wxapp_express values ('skynetmalaysia', '', 'SkyNet Malaysia', 'https://cdn.kuaidi100.com/images/all/56/skynetmalaysia.png')

insert into wxapp_express values ('southafrican', '', '南非(South African Post Office)', 'https://cdn.kuaidi100.com/images/all/56/southafrican.png')

insert into wxapp_express values ('swisspost', '', '瑞士(Swiss Post)', 'https://cdn.kuaidi100.com/images/all/56/swisspost.png')

insert into wxapp_express values ('tcixps', '', 'TCI XPS', 'https://cdn.kuaidi100.com/images/all/56/tcixps.png')

insert into wxapp_express values ('thailand', '', '泰国(Thailand Thai Post)', 'https://cdn.kuaidi100.com/images/all/56/thailand.png')

insert into wxapp_express values ('tnt', '', 'TNT', 'https://cdn.kuaidi100.com/images/all/56/tnt.png')

insert into wxapp_express values ('tntau', '', 'TNT Australia', 'https://cdn.kuaidi100.com/images/all/56/tntau.png')

insert into wxapp_express values ('tntitaly', '', 'TNT Italy', 'https://cdn.kuaidi100.com/images/all/56/tntitaly.png')

insert into wxapp_express values ('tntpostcn', '', 'TNT Post', 'https://cdn.kuaidi100.com/images/all/56/tntpostcn.png')

insert into wxapp_express values ('tntuk', '', 'TNT UK', 'https://cdn.kuaidi100.com/images/all/56/tntuk.png')

insert into wxapp_express values ('tollpriority', '', 'Toll Priority(Toll Online)', 'https://cdn.kuaidi100.com/images/all/56/tollpriority.png')

insert into wxapp_express values ('tunisia', '', '突尼斯EMS(Rapid-Poste)', 'https://cdn.kuaidi100.com/images/all/56/tunisia.png')

insert into wxapp_express values ('ukrpost', '', '乌克兰小包、大包(UkrPost)', 'https://cdn.kuaidi100.com/images/all/56/ukrpost.png')

insert into wxapp_express values ('ups', '', 'UPS', 'https://cdn.kuaidi100.com/images/all/56/ups.png')

insert into wxapp_express values ('upsfreight', '', 'UPS Freight', 'https://cdn.kuaidi100.com/images/all/56/upsfreight.png')

insert into wxapp_express values ('usps', '', 'USPS', 'https://cdn.kuaidi100.com/images/all/56/usps.png')

insert into wxapp_express values ('usps', '', 'USPS', 'https://cdn.kuaidi100.com/images

收藏 0
神奇 快递 100 python 抓取
评论 ( 0 )