学习数据分析,然后没有合适的数据源,从国家统计局的网页上抓取一页数据来玩玩(没有发现robots协议,也仅仅发出一次连接请求,不对网站造成任何负荷)

运行效果

(截图:爬取70城房价到oracle数据库并6合1 第1张、第2张)

SRE实战 互联网时代守护先锋,助力企业售后服务体系运筹帷幄!一键直达领取阿里云限量特价优惠。

源码

python代码

'''
本脚本旨在爬取70城房价进入oracle数据库以供学习
code by 九命猫幺

网页中有6个表格
    
最终爬取到数据库中形成6合1报表
'''
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

#Fetch a web page
def getHTMLText(url):
    """Fetch *url* and return its decoded text.

    On any request failure the literal string '产生异常' is returned
    instead of raising -- kept for backward compatibility, since callers
    feed the result straight into BeautifulSoup.
    """
    # NOTE(review): the UA string impersonates Baiduspider; confirm the
    # site permits this before reuse.
    headers={'User-Agent':'Baiduspider'}
    try:
        r = requests.get(url,headers=headers,timeout=30)
        r.raise_for_status()  # turn 4xx/5xx into an exception
        # Sniff the real encoding from the body; the page headers may lie.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP errors are
        # expected here; anything else should surface.
        return '产生异常'

#Parse one table body into a flat list of cell strings
def getTrText(tbody,tnum):
    """Flatten one <tbody> into a list of cell strings and trim the header.

    tbody -- a BeautifulSoup element for one of the page's six tables.
    tnum  -- 1-based table number; tables 1-2 have 70x4 data cells,
             tables 3-6 have 35x10.
    Returns the list of data-cell strings with headers stripped.
    """
    uinfo1 = []
    uinfo2 = []
    # Collect every text node, dropping full-width (U+3000) and ASCII spaces.
    for i in tbody.strings:
        if i != ' ':
            uinfo1.append(str(i.string).replace('\u3000','').replace(' ',''))
    # HACK: the page splits three-character city names across text nodes
    # ('秦'+'皇'+'岛' etc.). Rebuild them by expanding the first character
    # and skipping the orphaned trailing characters. Order of the chained
    # replaces matters; this only works for exactly these five cities.
    for i in uinfo1:
        if i not in ['皇','岛', '家','庄','丹','江','尔','滨','顶','山']:
            uinfo2.append(i.replace('秦','秦皇岛').replace('石','石家庄').replace('牡','牡丹江').replace('哈','哈尔滨').replace('平','平顶山'))
    # Keep only the trailing data cells: 280 = 70 rows x 4 cols for tables
    # 1-2, 350 = 35 rows x 10 cols for tables 3-6 (header cells discarded).
    uinfo2 = uinfo2[{1:-280,2:-280,3:-350,4:-350,5:-350,6:-350}[tnum]::]
    return uinfo2

#Reshape a parsed cell list and load it into Oracle
def toSql(uinfo,tnum):
    """Turn the flat cell list into a DataFrame and write it to Oracle.

    Tables 1-2 are 70 rows x 4 columns, tables 3-6 are 35 rows x 10.
    The staging table tb_fj_70city_t<tnum> is replaced on every run.
    """
    if tnum in [1,2]:
        shape = (70, 4)
        cols = ['city', 'mom', 'yoy', 'fbr']
    else:
        shape = (35, 10)
        cols = ['city', 'mom_90l', 'yoy_90l', 'fbr_90l',
                'mom_90t144', 'yoy_90t144', 'fbr_90t144',
                'mom_144u', 'yoy_144u', 'fbr_144u']
    frame = pd.DataFrame(np.array(uinfo).reshape(shape), columns=cols)
    engine = create_engine('oracle+cx_oracle://edw:oracle@192.168.168.5:1521/?service_name=edw')
    frame.to_sql('tb_fj_70city_t' + str(tnum), engine, if_exists='replace', index=False)
    
    
if __name__ == "__main__":
    url = 'http://www.stats.gov.cn/tjsj/zxfb/201911/t20191115_1709560.html'

    # Fetch the release page once and locate its six data tables.
    soup = BeautifulSoup(getHTMLText(url), 'html.parser')
    bodies = soup.select('table.MsoNormalTable tbody')
    # Parse each table and stage it into its own Oracle table (t1..t6).
    for idx in range(1, 7):
        cells = getTrText(bodies[idx - 1], idx)
        toSql(cells, idx)

数据库代码

-- Price changes of commodity housing in 70 large/mid-size cities.
-- Merge the six staged tables into one wide monthly table:
--   t1 -> new_* overall columns, t2 -> old_* overall columns (70 rows each)
--   t3 UNION t4 -> new_* size-band columns (two halves of 35 rows)
--   t5 UNION t6 -> old_* size-band columns (two halves of 35 rows)
-- Staged values are strings, hence the to_number() conversions.
CREATE TABLE tb_fj_70city_201910 AS
WITH tmp1 AS(
SELECT to_char(a.city) city,to_number(a.mom) new_mom,to_number(a.yoy) new_yoy,to_number(a.fbr) new_fbr
FROM tb_fj_70city_t1 a),
tmp2 AS(
SELECT to_char(a.city) city,to_number(a.mom) old_mom,to_number(a.yoy) old_yoy,to_number(a.fbr) old_fbr
FROM tb_fj_70city_t2 a),
tmp3 AS(
SELECT to_char(a.city) city,to_number(a.mom_90l) new_mom_90l,to_number(a.yoy_90l) new_yoy_90l,to_number(a.fbr_90l) new_fbr_90l,
to_number(a.mom_90t144) new_mom_90t144,to_number(a.yoy_90t144) new_yoy_90t144,to_number(a.fbr_90t144) new_fbr_90t144,
to_number(a.mom_144u) new_mom_144u,to_number(a.yoy_144u) new_yoy_144u,to_number(a.fbr_144u) new_fbr_144u
FROM tb_fj_70city_t3 a
UNION
SELECT to_char(a.city) city,to_number(a.mom_90l) new_mom_90l,to_number(a.yoy_90l) new_yoy_90l,to_number(a.fbr_90l) new_fbr_90l,
to_number(a.mom_90t144) new_mom_90t144,to_number(a.yoy_90t144) new_yoy_90t144,to_number(a.fbr_90t144) new_fbr_90t144,
to_number(a.mom_144u) new_mom_144u,to_number(a.yoy_144u) new_yoy_144u,to_number(a.fbr_144u) new_fbr_144u
FROM tb_fj_70city_t4 a),
tmp4 AS(
SELECT to_char(a.city) city,to_number(a.mom_90l) old_mom_90l,to_number(a.yoy_90l) old_yoy_90l,to_number(a.fbr_90l) old_fbr_90l,
to_number(a.mom_90t144) old_mom_90t144,to_number(a.yoy_90t144) old_yoy_90t144,to_number(a.fbr_90t144) old_fbr_90t144,
to_number(a.mom_144u) old_mom_144u,to_number(a.yoy_144u) old_yoy_144u,to_number(a.fbr_144u) old_fbr_144u
FROM tb_fj_70city_t5 a
UNION
SELECT to_char(a.city) city,to_number(a.mom_90l) old_mom_90l,to_number(a.yoy_90l) old_yoy_90l,to_number(a.fbr_90l) old_fbr_90l,
to_number(a.mom_90t144) old_mom_90t144,to_number(a.yoy_90t144) old_yoy_90t144,to_number(a.fbr_90t144) old_fbr_90t144,
to_number(a.mom_144u) old_mom_144u,to_number(a.yoy_144u) old_yoy_144u,to_number(a.fbr_144u) old_fbr_144u
FROM tb_fj_70city_t6 a)
-- NOTE(review): "bb. old_mom" has a stray space after the dot; Oracle
-- accepts it, but it should read "bb.old_mom".
SELECT 201910 month,aa.city,aa.new_mom,aa.new_yoy,aa.new_fbr,bb. old_mom,bb.old_yoy,bb.old_fbr,
cc.new_mom_90l,cc.new_yoy_90l,cc.new_fbr_90l,
cc.new_mom_90t144,cc.new_yoy_90t144,cc.new_fbr_90t144,
cc.new_mom_144u,cc.new_yoy_144u,cc.new_fbr_144u,
dd.old_mom_90l,dd.old_yoy_90l,dd.old_fbr_90l,
dd.old_mom_90t144,dd.old_yoy_90t144,dd.old_fbr_90t144,
dd.old_mom_144u,dd.old_yoy_144u,dd.old_fbr_144u
FROM tmp1 aa
JOIN tmp2 bb ON aa.city=bb.city
JOIN tmp3 cc ON aa.city=cc.city
JOIN tmp4 dd ON aa.city=dd.city;

-- Drop the staging tables now that the wide table exists.
CALL p_drop_table_if_exist('tb_fj_70city_t1');
CALL p_drop_table_if_exist('tb_fj_70city_t2');
CALL p_drop_table_if_exist('tb_fj_70city_t3');
CALL p_drop_table_if_exist('tb_fj_70city_t4');
CALL p_drop_table_if_exist('tb_fj_70city_t5');
CALL p_drop_table_if_exist('tb_fj_70city_t6');

-- Inspect the merged result.
SELECT * FROM tb_fj_70city_201910;

就这样。表中的列名取英文单词的首字母缩写:

mom:month on month ,环比

yoy:year on year,同比

fbr:fixed base ratio,定基比

90l:90 lower,90平米以下

144u:144 upper,144平米以上

90t144:90 to 144,90到144平米之间

优化后

上述脚本只能爬取一个月的,并且6表合1操作在数据库中执行,现在优化为批量爬取多个月份的数据


'''
本脚本旨在爬取70城房价进入oracle数据库以供学习
code by 九命猫幺

网页中有6个表格
    
最终爬取到数据库中形成6合1报表

网址:
'''
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import cx_Oracle

#Fetch a web page
def getHTMLText(url):
    """Fetch *url* and return its decoded text.

    On any request failure the literal string '产生异常' is returned
    instead of raising -- kept for backward compatibility, since callers
    feed the result straight into BeautifulSoup.
    """
    # NOTE(review): the UA string impersonates Baiduspider; confirm the
    # site permits this before reuse.
    headers={'User-Agent':'Baiduspider'}
    try:
        r = requests.get(url,headers=headers,timeout=30)
        r.raise_for_status()  # turn 4xx/5xx into an exception
        # Sniff the real encoding from the body; the page headers may lie.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP errors are
        # expected here; anything else should surface.
        return '产生异常'

#Parse one table body into a flat list of cell strings
def getTrText(tbody,tnum):
    """Flatten one <tbody> into a list of cell strings and trim the header.

    tbody -- a BeautifulSoup element for one of the page's six tables.
    tnum  -- 1-based table number; tables 1-2 have 70x4 data cells,
             tables 3-6 have 35x10.
    Returns the list of data-cell strings with headers stripped.
    """
    uinfo1 = []
    uinfo2 = []
    # Collect every text node, dropping full-width (U+3000) and ASCII spaces.
    for i in tbody.strings:
        if i != ' ':
            uinfo1.append(str(i.string).replace('\u3000','').replace(' ',''))
    # HACK: the page splits three-character city names across text nodes
    # ('秦'+'皇'+'岛' etc.). Rebuild them by expanding the first character
    # and skipping the orphaned trailing characters. Order of the chained
    # replaces matters; this only works for exactly these five cities.
    for i in uinfo1:
        if i not in ['皇','岛', '家','庄','丹','江','尔','滨','顶','山']:
            uinfo2.append(i.replace('秦','秦皇岛').replace('石','石家庄').replace('牡','牡丹江').replace('哈','哈尔滨').replace('平','平顶山'))
    # Keep only the trailing data cells: 280 = 70 rows x 4 cols for tables
    # 1-2, 350 = 35 rows x 10 cols for tables 3-6 (header cells discarded).
    uinfo2 = uinfo2[{1:-280,2:-280,3:-350,4:-350,5:-350,6:-350}[tnum]::]
    return uinfo2

#Reshape a parsed cell list and load it into Oracle
def toSql(uinfo,tnum):
    """Turn the flat cell list into a DataFrame and write it to Oracle.

    Tables 1-2 are 70 rows x 4 columns, tables 3-6 are 35 rows x 10.
    The staging table tb_fj_70city_t<tnum> is replaced on every run.
    """
    if tnum in [1,2]:
        shape = (70, 4)
        cols = ['city', 'mom', 'yoy', 'fbr']
    else:
        shape = (35, 10)
        cols = ['city', 'mom_90l', 'yoy_90l', 'fbr_90l',
                'mom_90t144', 'yoy_90t144', 'fbr_90t144',
                'mom_144u', 'yoy_144u', 'fbr_144u']
    frame = pd.DataFrame(np.array(uinfo).reshape(shape), columns=cols)
    engine = create_engine('oracle+cx_oracle://edw:oracle@192.168.168.5:1521/?service_name=edw')
    frame.to_sql('tb_fj_70city_t' + str(tnum), engine, if_exists='replace', index=False)
  
#6-in-1: merge the six staging tables into one monthly wide table
def intoWideTable(month):
    """Join staging tables tb_fj_70city_t1..t6 into tb_fj_70city_<month>.

    month -- integer like 201910, embedded both in the table name and as a
    literal month column. It comes from the hard-coded urls dict (trusted
    values), so concatenating it into SQL is acceptable here; never pass
    untrusted input through this function.
    """
    con = cx_Oracle.connect('edw','oracle','192.168.168.5:1521/edw')
    try:
        cur = con.cursor()
        try:
            # Drop any previous run's table, then rebuild it from scratch.
            cur.execute("CALL p_drop_table_if_exist('tb_fj_70city_"+str(month)+"')")
            cur.execute('''CREATE TABLE tb_fj_70city_'''+str(month)+''' AS
WITH tmp1 AS(
SELECT to_char(a.city) city,to_number(a.mom) new_mom,to_number(a.yoy) new_yoy,to_number(a.fbr) new_fbr
FROM tb_fj_70city_t1 a),
tmp2 AS(
SELECT to_char(a.city) city,to_number(a.mom) old_mom,to_number(a.yoy) old_yoy,to_number(a.fbr) old_fbr
FROM tb_fj_70city_t2 a),
tmp3 AS(
SELECT to_char(a.city) city,to_number(a.mom_90l) new_mom_90l,to_number(a.yoy_90l) new_yoy_90l,to_number(a.fbr_90l) new_fbr_90l,
to_number(a.mom_90t144) new_mom_90t144,to_number(a.yoy_90t144) new_yoy_90t144,to_number(a.fbr_90t144) new_fbr_90t144,
to_number(a.mom_144u) new_mom_144u,to_number(a.yoy_144u) new_yoy_144u,to_number(a.fbr_144u) new_fbr_144u
FROM tb_fj_70city_t3 a
UNION
SELECT to_char(a.city) city,to_number(a.mom_90l) new_mom_90l,to_number(a.yoy_90l) new_yoy_90l,to_number(a.fbr_90l) new_fbr_90l,
to_number(a.mom_90t144) new_mom_90t144,to_number(a.yoy_90t144) new_yoy_90t144,to_number(a.fbr_90t144) new_fbr_90t144,
to_number(a.mom_144u) new_mom_144u,to_number(a.yoy_144u) new_yoy_144u,to_number(a.fbr_144u) new_fbr_144u
FROM tb_fj_70city_t4 a),
tmp4 AS(
SELECT to_char(a.city) city,to_number(a.mom_90l) old_mom_90l,to_number(a.yoy_90l) old_yoy_90l,to_number(a.fbr_90l) old_fbr_90l,
to_number(a.mom_90t144) old_mom_90t144,to_number(a.yoy_90t144) old_yoy_90t144,to_number(a.fbr_90t144) old_fbr_90t144,
to_number(a.mom_144u) old_mom_144u,to_number(a.yoy_144u) old_yoy_144u,to_number(a.fbr_144u) old_fbr_144u
FROM tb_fj_70city_t5 a
UNION
SELECT to_char(a.city) city,to_number(a.mom_90l) old_mom_90l,to_number(a.yoy_90l) old_yoy_90l,to_number(a.fbr_90l) old_fbr_90l,
to_number(a.mom_90t144) old_mom_90t144,to_number(a.yoy_90t144) old_yoy_90t144,to_number(a.fbr_90t144) old_fbr_90t144,
to_number(a.mom_144u) old_mom_144u,to_number(a.yoy_144u) old_yoy_144u,to_number(a.fbr_144u) old_fbr_144u
FROM tb_fj_70city_t6 a)
SELECT '''+str(month)+''' month,aa.city,aa.new_mom,aa.new_yoy,aa.new_fbr,bb. old_mom,bb.old_yoy,bb.old_fbr,
cc.new_mom_90l,cc.new_yoy_90l,cc.new_fbr_90l,
cc.new_mom_90t144,cc.new_yoy_90t144,cc.new_fbr_90t144,
cc.new_mom_144u,cc.new_yoy_144u,cc.new_fbr_144u,
dd.old_mom_90l,dd.old_yoy_90l,dd.old_fbr_90l,
dd.old_mom_90t144,dd.old_yoy_90t144,dd.old_fbr_90t144,
dd.old_mom_144u,dd.old_yoy_144u,dd.old_fbr_144u
FROM tmp1 aa
JOIN tmp2 bb ON aa.city=bb.city
JOIN tmp3 cc ON aa.city=cc.city
JOIN tmp4 dd ON aa.city=dd.city''')
        finally:
            # Release DB resources even when a statement fails; the
            # original version leaked the connection on error.
            cur.close()
    finally:
        con.close()
    
if __name__ == "__main__":
    # Month -> release-page URL for the batch run.
    urls = {201910:'http://www.stats.gov.cn/tjsj/zxfb/201911/t20191115_1709560.html',
            201909:'http://www.stats.gov.cn/tjsj/zxfb/201910/t20191021_1704063.html',
            201908:'http://www.stats.gov.cn/tjsj/zxfb/201909/t20190917_1697943.html',
            201907:'http://www.stats.gov.cn/statsinfo/auto2074/201908/t20190815_1691536.html',
            201906:'http://www.stats.gov.cn/tjsj/zxfb/201907/t20190715_1676000.html',
            201905:'http://www.stats.gov.cn/tjsj/zxfb/201906/t20190618_1670960.html',
            201904:'http://www.stats.gov.cn/tjsj/zxfb/201905/t20190516_1665286.html',
            201903:'http://www.stats.gov.cn/tjsj/zxfb/201904/t20190416_1659682.html'
            }
    for month, page_url in urls.items():
        # Fetch and parse this month's release page.
        soup = BeautifulSoup(getHTMLText(page_url), 'html.parser')
        bodies = soup.select('table.MsoNormalTable tbody')
        # Stage the six tables into Oracle (t1..t6)...
        for idx in range(1, 7):
            toSql(getTrText(bodies[idx - 1], idx), idx)
        # ...then merge them into the monthly wide table.
        intoWideTable(month)

(截图:爬取70城房价到oracle数据库并6合1 第3张)数据库中同时得到了多个月份的数据。

再优化单一月份爬取的代码


import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import cx_Oracle
 
#Fetch a web page
def getHTMLText(url):
    """Fetch *url* and return its decoded text.

    On any request failure the literal string '产生异常' is returned
    instead of raising -- kept for backward compatibility, since callers
    feed the result straight into BeautifulSoup.
    """
    # NOTE(review): the UA string impersonates Baiduspider; confirm the
    # site permits this before reuse.
    headers={'User-Agent':'Baiduspider'}
    try:
        r = requests.get(url,headers=headers,timeout=30)
        r.raise_for_status()  # turn 4xx/5xx into an exception
        # Sniff the real encoding from the body; the page headers may lie.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP errors are
        # expected here; anything else should surface.
        return '产生异常'
 
#Parse one table body into a flat list of cell strings
def getTrText(tbody,tnum):
    """Flatten one <tbody> into a list of cell strings and trim the header.

    tbody -- a BeautifulSoup element for one of the page's six tables.
    tnum  -- 1-based table number; tables 1-2 have 70x4 data cells,
             any other number is treated as a 35x10 table.
    Returns the list of data-cell strings with headers stripped.

    Unlike the earlier hard-coded version, city names split across text
    nodes are re-joined generically: consecutive strings that both start
    in the CJK range U+4E00..U+9FA5 are concatenated.
    """
    uinfo1 = []
    # '...' sentinel is non-CJK, so the first real string is appended
    # rather than merged into it.
    uinfo2 = ['...']
    for i in tbody.strings:
        if i not in ['  ',' ']:
            uinfo1.append(str(i.string).replace('  ',''))
    for i in uinfo1:
        # Lexicographic comparison against the CJK range bounds is
        # effectively a first-character test; merge a CJK fragment into a
        # preceding CJK fragment to rebuild split city names.
        if '\u4e00' <= i <= '\u9fa5' and '\u4e00' <= uinfo2[-1] <= '\u9fa5':
            uinfo2[-1]=uinfo2[-1]+i
        else:
            uinfo2.append(i)
    # Keep only the trailing data cells: 280 (70x4) for tables 1-2, 350
    # (35x10) otherwise. The '...' sentinel is dropped by the negative
    # slice -- assumes the table yields more cells than the slice keeps.
    uinfo2 = uinfo2[{1:-280,2:-280}.get(tnum,-350)::]
    return uinfo2
 
#Reshape a parsed cell list and load it into Oracle
def toSql(uinfo,tnum):
    """Turn the flat cell list into a DataFrame and write it to Oracle.

    Tables 1-2 are 70 rows x 4 columns, tables 3-6 are 35 rows x 10.
    The staging table tb_fj_70city_t<tnum> is replaced on every run.
    """
    if tnum in [1,2]:
        shape = (70, 4)
        cols = ['city', 'mom', 'yoy', 'fbr']
    else:
        shape = (35, 10)
        cols = ['city', 'mom_90l', 'yoy_90l', 'fbr_90l',
                'mom_90t144', 'yoy_90t144', 'fbr_90t144',
                'mom_144u', 'yoy_144u', 'fbr_144u']
    frame = pd.DataFrame(np.array(uinfo).reshape(shape), columns=cols)
    engine = create_engine('oracle+cx_oracle://edw:oracle@192.168.168.5:1521/?service_name=edw')
    frame.to_sql('tb_fj_70city_t' + str(tnum), engine, if_exists='replace', index=False)
     
#6-in-1: merge the staging tables and upsert into the history wide table
def intoWideTable(month):
    """Build tb_fj_70city_<month> from t1..t6 and upsert it into tb_fj_70city.

    month -- integer like 201910, embedded both in table names and as a
    literal month column. It comes from hard-coded values (trusted), so
    concatenating it into SQL is acceptable here; never pass untrusted
    input through this function.
    """
    con = cx_Oracle.connect('edw','oracle','192.168.168.5:1521/edw')
    try:
        cur = con.cursor()
        try:
            # Drop any previous run's table, then rebuild it from scratch.
            cur.execute("CALL p_drop_table_if_exist('tb_fj_70city_"+str(month)+"')")
            cur.execute('''CREATE TABLE tb_fj_70city_'''+str(month)+''' AS
WITH tmp1 AS(
SELECT to_char(a.city) city,to_number(a.mom) new_mom,to_number(a.yoy) new_yoy,to_number(a.fbr) new_fbr
FROM tb_fj_70city_t1 a),
tmp2 AS(
SELECT to_char(a.city) city,to_number(a.mom) old_mom,to_number(a.yoy) old_yoy,to_number(a.fbr) old_fbr
FROM tb_fj_70city_t2 a),
tmp3 AS(
SELECT to_char(a.city) city,to_number(a.mom_90l) new_mom_90l,to_number(a.yoy_90l) new_yoy_90l,to_number(a.fbr_90l) new_fbr_90l,
to_number(a.mom_90t144) new_mom_90t144,to_number(a.yoy_90t144) new_yoy_90t144,to_number(a.fbr_90t144) new_fbr_90t144,
to_number(a.mom_144u) new_mom_144u,to_number(a.yoy_144u) new_yoy_144u,to_number(a.fbr_144u) new_fbr_144u
FROM tb_fj_70city_t3 a
UNION
SELECT to_char(a.city) city,to_number(a.mom_90l) new_mom_90l,to_number(a.yoy_90l) new_yoy_90l,to_number(a.fbr_90l) new_fbr_90l,
to_number(a.mom_90t144) new_mom_90t144,to_number(a.yoy_90t144) new_yoy_90t144,to_number(a.fbr_90t144) new_fbr_90t144,
to_number(a.mom_144u) new_mom_144u,to_number(a.yoy_144u) new_yoy_144u,to_number(a.fbr_144u) new_fbr_144u
FROM tb_fj_70city_t4 a),
tmp4 AS(
SELECT to_char(a.city) city,to_number(a.mom_90l) old_mom_90l,to_number(a.yoy_90l) old_yoy_90l,to_number(a.fbr_90l) old_fbr_90l,
to_number(a.mom_90t144) old_mom_90t144,to_number(a.yoy_90t144) old_yoy_90t144,to_number(a.fbr_90t144) old_fbr_90t144,
to_number(a.mom_144u) old_mom_144u,to_number(a.yoy_144u) old_yoy_144u,to_number(a.fbr_144u) old_fbr_144u
FROM tb_fj_70city_t5 a
UNION
SELECT to_char(a.city) city,to_number(a.mom_90l) old_mom_90l,to_number(a.yoy_90l) old_yoy_90l,to_number(a.fbr_90l) old_fbr_90l,
to_number(a.mom_90t144) old_mom_90t144,to_number(a.yoy_90t144) old_yoy_90t144,to_number(a.fbr_90t144) old_fbr_90t144,
to_number(a.mom_144u) old_mom_144u,to_number(a.yoy_144u) old_yoy_144u,to_number(a.fbr_144u) old_fbr_144u
FROM tb_fj_70city_t6 a)
SELECT '''+str(month)+''' month,aa.city,aa.new_mom,aa.new_yoy,aa.new_fbr,bb. old_mom,bb.old_yoy,bb.old_fbr,
cc.new_mom_90l,cc.new_yoy_90l,cc.new_fbr_90l,
cc.new_mom_90t144,cc.new_yoy_90t144,cc.new_fbr_90t144,
cc.new_mom_144u,cc.new_yoy_144u,cc.new_fbr_144u,
dd.old_mom_90l,dd.old_yoy_90l,dd.old_fbr_90l,
dd.old_mom_90t144,dd.old_yoy_90t144,dd.old_fbr_90t144,
dd.old_mom_144u,dd.old_yoy_144u,dd.old_fbr_144u
FROM tmp1 aa
JOIN tmp2 bb ON aa.city=bb.city
JOIN tmp3 cc ON aa.city=cc.city
JOIN tmp4 dd ON aa.city=dd.city''')
            # Idempotent upsert into the cumulative history table: remove
            # this month's rows, then insert the freshly built ones.
            cur.execute('delete from tb_fj_70city where month='+str(month))
            cur.execute('insert into tb_fj_70city select * from tb_fj_70city_'+str(month))
            con.commit()
        finally:
            # Release DB resources even when a statement fails; the
            # original version leaked the connection on error.
            cur.close()
    finally:
        con.close()
     
if __name__ == "__main__":
    month = 201910
    url = 'http://www.stats.gov.cn/tjsj/zxfb/201911/t20191115_1709560.html'

    # Fetch the release page and locate its six data tables.
    soup = BeautifulSoup(getHTMLText(url), 'html.parser')
    bodies = soup.select('table.MsoNormalTable tbody')
    # Stage each table into Oracle (tb_fj_70city_t1..t6)...
    for idx in range(1, 7):
        toSql(getTrText(bodies[idx - 1], idx), idx)
    # ...then merge them and upsert into the cumulative wide table.
    intoWideTable(month)


ok了

扫码关注我们
微信号:SRE实战
拒绝背锅 运筹帷幄