代码:- from requests import get
- from bs4 import BeautifulSoup
- from user_agent import generate_user_agent
- import time
def request_content(start, end):
    """Fetch the draw-history rows for the period range ``start``..``end``.

    Requests the 500.com Double Color Ball history page for the given
    period numbers and returns the data rows of its results table.

    Args:
        start: First period number to request (inserted into the URL as-is).
        end: Last period number to request (inserted into the URL as-is).

    Returns:
        The ``<tr class="t_tr1">`` tags found inside ``<tbody id="tdata">``,
        or an empty list when the page has no such table body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    url_link = 'https://datachart.500.com/ssq/history/newinc/history.php?start={0}&end={1}'.format(start, end)
    headers = {
        # A randomly generated User-Agent is sent with every request.
        'User-Agent': generate_user_agent(device_type='desktop', os=('mac', 'linux', 'win', 'android')),
    }
    response = get(url_link, headers=headers, timeout=6)
    # Fail loudly on an HTTP error instead of parsing an error page as data.
    response.raise_for_status()

    page_content = BeautifulSoup(response.content, 'html.parser')
    table_body = page_content.find('tbody', id='tdata')
    if table_body is None:
        # No results table in the response: report "no rows" rather than
        # crashing with an IndexError on an empty find_all() result.
        return []
    return table_body.find_all('tr', 't_tr1')
class ssqclazz:
    """One Double Color Ball (SSQ) draw record parsed from a history-table row.

    Every attribute is stored as text. ``str(record)`` yields a single
    comma-separated line with exactly one column per attribute, in the
    order given by ``_FIELDS``.
    """

    # Attribute names in the order of the <td> cells of a table row; this is
    # also the column order of the line produced by __str__.
    _FIELDS = (
        'period',        # draw period number
        'red_1',         # red balls 1-6
        'red_2',
        'red_3',
        'red_4',
        'red_5',
        'red_6',
        'blue_1',        # blue ball
        'happy_sunday',  # "Happy Sunday" column
        'pool_prize',    # prize pool (yuan)
        'first_count',   # first prize: number of winning tickets
        'first_prize',   # first prize: amount (yuan)
        'second_count',  # second prize: number of winning tickets
        'second_prize',  # second prize: amount (yuan)
        'total_prize',   # total amount wagered (yuan)
        'lottery_date',  # draw date
    )

    def __init__(self):
        """Create an empty record with every field set to ``''``."""
        for name in self._FIELDS:
            setattr(self, name, '')

    def __str__(self):
        """Return the record as one comma-separated line (no newline)."""
        return ','.join(str(getattr(self, name)) for name in self._FIELDS)

    def tr_tag(self, tag):
        """Fill this record from the ``<td>`` cells of one table row.

        Args:
            tag: A row element exposing ``find_all('td')``; the first
                ``len(_FIELDS)`` cells are read in order.

        Raises:
            IndexError: If the row has fewer cells than there are fields.
        """
        tds = tag.find_all('td')
        if len(tds) < len(self._FIELDS):
            raise IndexError(
                'expected at least {0} <td> cells, found {1}'.format(len(self._FIELDS), len(tds))
            )
        for name, td in zip(self._FIELDS, tds):
            text = td.string
            if text is None:
                # BeautifulSoup's .string is None when a tag has several
                # children; fall back to the concatenated text so the
                # literal "None" never ends up in the output.
                text = td.get_text()
            # Amount cells use "," as a thousands separator, which would
            # split one value across several columns of the comma-separated
            # line, so the separators are dropped here.
            setattr(self, name, str(text).replace(',', ''))
if __name__ == '__main__':
    # Scrape the draw history year by year and append one comma-separated
    # line per draw to ssq.txt. The file is opened in append mode, so running
    # the script again adds to whatever is already there.
    with open('ssq.txt', mode='a+', encoding='utf-8') as file:
        localtime = time.localtime(time.time())
        lyear = localtime.tm_year
        ymin = 3  # Double Color Ball launched in 2003
        ymax = lyear - 2000  # two-digit current year
        print('===抓取数据开始===,200%s-20%s' % (ymin, ymax))
        for year in range(ymin, ymax + 1):
            # Period numbers are five digits (two-digit year + three-digit
            # draw number), so single-digit years are zero-padded.
            # NOTE(review): presumably the site expects "03001", not "3001" — confirm.
            start = '{0:02d}001'.format(year)
            end = '{0:02d}300'.format(year)
            trs = request_content(start, end)
            for tr in trs:
                ssqobj = ssqclazz()
                ssqobj.tr_tag(tr)
                objstr = str(ssqobj)
                file.write(objstr)
                file.write('\n')
                print(objstr)
            # A blank line separates one year's draws from the next.
            file.write('\n')
            print()
            # Pause between requests.
            time.sleep(3)
    # The with-block has closed the file, even if a request or parse failed.
    print('抓取完毕!!!')
截取2020年的数据如下:
- 20089,02,14,16,21,29,30,10, ,0,0,0,0,0,0,2020-09-13
- 20088,01,06,12,26,29,30,12, ,1,049,789,346,14,6,377,823,175,137,782,359,763,320,2020-09-10
- 20087,11,15,20,23,25,33,10, ,1,066,743,130,5,7,940,558,44,417,692,351,393,474,2020-09-08
- 20086,02,04,06,15,24,27,06, ,1,051,310,454,7,7,452,652,117,183,424,383,150,704,2020-09-06
- 20085,01,02,05,09,19,24,16, ,1,039,096,893,3,10,000,000,135,175,949,349,243,256,2020-09-03
- 20084,03,07,16,17,23,30,07, ,997,837,500,15,5,641,465,265,45,386,340,223,360,2020-09-01
- 20083,01,19,25,26,30,31,12, ,1,046,377,044,7,7,916,189,96,265,798,375,628,538,2020-08-30
- 20082,02,08,11,17,21,30,09, ,1,025,240,399,22,5,094,481,320,8,119,339,738,944,2020-08-27
- 20081,01,05,13,14,27,33,15, ,1,129,524,256,11,7,057,769,108,261,984,341,884,184,2020-08-25
- 20080,14,15,18,22,31,33,01, ,1,122,276,717,2,10,000,000,70,397,581,375,199,530,2020-08-23
- 20079,05,12,20,21,22,29,14, ,1,058,784,536,6,8,578,055,183,146,641,347,031,342,2020-08-20
- 20078,03,11,14,16,21,32,04, ,1,029,746,619,1,10,000,000,76,347,407,339,766,446,2020-08-18
- 20077,03,10,16,21,25,27,12, ,960,537,693,27,5,841,121,326,87,079,376,430,570,2020-08-16
- 20076,10,15,16,18,20,27,06, ,1,033,084,399,8,6,895,903,82,231,207,346,015,916,2020-08-13
- 20075,03,11,13,20,24,30,16, ,1,031,374,526,1,10,000,000,143,155,685,342,593,198,2020-08-11
- 20074,04,08,09,13,19,33,12, ,974,585,503,5,8,629,293,107,211,991,375,478,220,2020-08-09
- 20073,05,07,11,13,27,29,03, ,949,682,723,8,7,005,129,129,155,436,344,984,126,2020-08-06
- 20072,06,08,10,15,17,26,04, ,945,569,865,7,7,525,472,104,212,479,335,806,408,2020-08-04
- 20071,09,11,12,13,22,23,08, ,931,954,524,12,6,156,653,133,130,449,363,190,440,2020-08-02
- 20070,01,02,04,06,19,21,15, ,953,784,945,23,5,829,067,100,238,356,343,442,594,2020-07-30
- 20069,03,09,10,13,18,26,04, ,1,016,346,427,7,7,827,856,99,249,936,343,151,528,2020-07-28
- 20068,12,16,21,26,27,32,10, ,996,910,195,17,5,430,373,128,71,448,376,630,286,2020-07-26
- 20067,04,07,09,23,27,30,08, ,1,061,790,227,10,6,979,941,140,176,780,353,153,516,2020-07-23
- 20066,02,09,13,17,26,28,07, ,1,057,341,835,15,6,006,277,137,137,720,352,149,702,2020-07-21
- 20065,09,15,18,21,23,26,08, ,1,090,832,907,5,8,838,315,89,269,544,381,062,334,2020-07-19
- 20064,01,03,07,21,27,32,01, ,1,063,056,063,1,10,000,000,77,351,356,356,057,386,2020-07-16
- 20063,12,15,16,22,29,32,14, ,991,892,696,3,10,000,000,103,289,264,357,076,054,2020-07-14
- 20062,10,14,17,22,26,27,05, ,932,509,919,5,8,901,954,94,259,438,391,827,962,2020-07-12
- 20061,08,17,24,26,27,31,04, ,903,858,043,1,10,000,000,91,281,269,366,447,904,2020-07-09
- 20060,05,09,14,20,24,30,08, ,837,071,557,38,5,238,209,554,20,424,356,937,202,2020-07-07
- 20059,02,04,10,17,22,25,14, ,1,002,178,704,4,10,000,000,190,147,308,386,807,160,2020-07-05
- 20058,01,03,11,12,19,26,07, ,958,212,978,18,5,569,985,136,94,299,356,052,050,2020-07-02
- 20057,09,14,21,23,26,32,03, ,1,019,998,705,3,10,000,000,113,193,145,349,147,892,2020-06-30
- 20056,02,05,08,12,26,31,14, ,984,522,456,17,6,280,167,186,146,255,380,233,782,2020-06-28
- 20055,01,05,07,23,28,30,12, ,1,009,674,637,14,5,761,052,198,67,264,327,140,694,2020-06-25
- 20054,03,10,19,25,26,31,02, ,1,050,374,135,11,6,596,678,210,104,544,338,817,016,2020-06-23
- 20053,02,14,15,16,32,33,01, ,1,057,074,616,19,6,277,278,360,84,264,381,666,534,2020-06-21
- 20052,02,08,13,29,32,33,15, ,1,085,336,824,2,10,000,000,53,568,303,352,815,504,2020-06-18
- 20051,03,06,08,11,19,28,08, ,1,014,976,572,23,5,442,583,268,47,478,342,968,302,2020-06-16
- 20050,04,09,17,20,32,33,15, ,1,101,983,169,5,9,664,099,90,323,895,379,409,292,2020-06-14
- 20049,01,07,09,12,18,22,04, ,1,062,851,807,7,7,652,110,140,165,756,355,402,476,2020-06-11
- 20048,12,14,18,23,30,32,02, ,1,046,798,674,6,8,364,818,75,336,481,351,604,616,2020-06-09
- 20047,04,10,17,19,28,32,01, ,1,021,279,165,10,7,200,661,140,196,487,382,084,220,2020-06-07
- 20046,13,19,25,26,27,32,08, ,1,010,760,962,7,6,855,509,87,186,617,350,122,868,2020-06-04
- 20045,02,03,15,21,22,33,16, ,1,010,042,406,16,6,075,531,115,187,048,348,295,556,2020-06-02
- 20044,07,08,16,17,19,24,07, ,1,042,719,025,10,6,803,090,107,210,641,379,328,476,2020-05-31
- 20043,01,11,24,25,27,30,07, ,1,043,134,035,5,7,928,059,64,285,943,360,394,276,2020-05-28
- 20042,02,06,07,11,14,31,03, ,1,027,873,219,5,8,415,115,98,217,800,345,472,844,2020-05-26
- 20041,06,07,16,22,24,25,13, ,1,005,915,378,2,10,000,000,102,255,217,372,422,582,2020-05-24
- 20040,01,06,12,16,19,21,04, ,947,818,891,5,8,416,563,92,232,103,341,533,118,2020-05-21
- 20039,02,09,10,11,16,29,02, ,925,841,149,9,6,952,125,166,132,297,341,609,556,2020-05-19
- 20038,01,06,07,18,23,24,15, ,922,526,038,15,6,276,646,317,75,511,374,760,420,2020-05-17
- 20037,01,04,11,13,17,24,15, ,944,864,355,6,6,810,277,65,208,878,350,126,486,2020-05-14
- 20036,07,09,16,22,24,32,06, ,944,994,783,3,10,000,000,104,204,187,346,455,736,2020-05-12
- 20035,10,14,24,25,28,33,11, ,911,288,358,4,9,777,104,97,246,242,371,811,232,2020-05-10
- 20034,02,08,15,16,26,32,03, ,878,740,204,11,6,780,706,182,134,531,340,793,680,2020-05-07
- 20033,07,10,12,21,31,32,01, ,879,873,846,4,10,000,000,97,263,480,320,555,362,2020-05-05
- 20032,03,11,13,14,15,26,13, ,843,200,936,11,6,588,951,135,161,837,338,904,002,2020-05-03
- 20031,02,05,09,15,16,27,09, ,850,135,149,14,5,603,722,310,34,081,323,903,392,2020-04-30
- 20030,17,18,21,29,30,32,03, ,896,891,829,1,10,000,000,74,322,591,327,786,632,2020-04-28
- 20029,01,12,18,20,30,32,05, ,835,276,431,7,7,432,574,63,337,857,364,268,814,2020-04-26
- 20028,05,06,15,18,26,32,08, ,823,449,380,7,7,465,464,153,140,998,323,795,596,2020-04-23
- 20027,12,13,14,24,25,28,06, ,810,989,197,3,10,000,000,112,211,713,324,196,702,2020-04-21
- 20026,01,04,09,18,19,31,06, ,769,853,394,3,10,000,000,106,181,297,349,633,986,2020-04-19
- 20025,01,09,10,15,20,29,02, ,742,200,788,4,9,977,229,156,159,526,321,611,812,2020-04-16
- 20024,10,13,27,28,29,31,08, ,707,451,262,8,7,028,971,94,215,848,317,568,426,2020-04-14
- 20023,04,05,20,21,30,33,08, ,702,813,893,6,7,277,843,107,159,661,340,318,270,2020-04-12
- 20022,02,10,20,22,25,30,02, ,695,229,480,8,6,761,570,97,181,605,311,416,184,2020-04-09
- 20021,02,06,14,16,27,31,07, ,696,474,938,5,7,581,452,116,139,086,296,655,800,2020-04-07
- 20020,01,06,09,13,15,22,14, ,685,979,970,4,9,871,526,91,267,666,310,764,432,2020-04-05
- 20019,15,19,27,28,30,33,03, ,652,393,178,7,7,437,091,60,355,409,293,920,314,2020-04-02
- 20018,05,07,08,11,17,22,13, ,640,479,172,8,6,340,865,188,71,322,287,194,532,2020-03-31
- 20017,02,04,07,15,20,27,04, ,650,980,126,15,5,992,154,127,146,479,295,712,918,2020-03-29
- 20016,05,06,08,17,24,27,07, ,685,053,744,12,5,979,822,89,165,138,260,860,736,2020-03-26
- 20015,08,09,22,24,30,33,01, ,712,719,613,1,10,000,000,60,337,347,255,551,054,2020-03-24
- 20014,02,07,08,10,12,16,07, ,661,997,100,5,6,947,355,66,184,408,261,933,168,2020-03-22
- 20013,02,08,10,20,21,30,14, ,660,220,960,4,7,500,138,184,67,938,215,545,590,2020-03-19
- 20012,04,13,14,23,26,31,09, ,652,719,440,3,8,598,462,112,120,484,198,501,652,2020-03-17
- 20011,04,05,07,17,18,29,01, ,638,032,121,8,6,079,680,67,161,146,176,331,724,2020-03-15
- 20010,11,14,22,27,31,32,09, ,654,279,153,2,8,485,717,42,207,483,142,522,854,2020-03-12
- 20009,03,06,08,14,19,26,12, ,645,107,708,16,6,003,258,253,79,308,405,733,380,2020-01-21
- 20008,01,04,06,10,11,28,16, ,680,964,320,17,6,202,629,262,97,541,395,048,088,2020-01-19
- 20007,05,12,17,20,25,31,10, ,709,741,377,38,5,371,408,269,65,583,358,035,962,2020-01-16
- 20006,03,04,05,10,16,32,09, ,860,929,224,20,5,735,328,363,50,642,361,697,316,2020-01-14
- 20005,11,16,17,22,26,32,04, ,920,486,146,8,7,446,888,224,109,236,400,642,476,2020-01-12
- 20004,02,15,17,27,32,33,03, ,906,654,598,7,7,900,679,189,134,290,370,826,158,2020-01-09
- 20003,09,17,26,29,30,32,03, ,885,816,521,16,6,237,758,134,184,740,368,960,938,2020-01-07
- 20002,04,09,14,15,16,29,11, ,911,355,134,16,6,188,650,194,122,541,410,784,878,2020-01-05
- 20001,02,15,23,26,29,30,02, ,939,054,504,4,10,000,000,113,275,836,388,311,860,2020-01-02
倒序排列,还需要优化。
# A small modification of the entry point is enough: each year's rows are
# written in reverse of the order they arrive in, so the output file is no
# longer in descending period order.
if __name__ == '__main__':
    # Scrape the draw history year by year and append one comma-separated
    # line per draw to ssq.txt. The file is opened in append mode, so running
    # the script again adds to whatever is already there.
    with open('ssq.txt', mode='a+', encoding='utf-8') as file:
        localtime = time.localtime(time.time())
        lyear = localtime.tm_year
        ymin = 3  # Double Color Ball launched in 2003
        ymax = lyear - 2000  # two-digit current year
        print('===抓取数据开始===,200%s-20%s' % (ymin, ymax))
        for year in range(ymin, ymax + 1):
            # Period numbers are five digits (two-digit year + three-digit
            # draw number), so single-digit years are zero-padded.
            # NOTE(review): presumably the site expects "03001", not "3001" — confirm.
            start = '{0:02d}001'.format(year)
            end = '{0:02d}300'.format(year)
            trs = request_content(start, end)
            # The page lists the newest draw first; walk the rows backwards
            # so they are written oldest-first.
            for tr in reversed(trs):
                ssqobj = ssqclazz()
                ssqobj.tr_tag(tr)
                objstr = str(ssqobj)
                file.write(objstr)
                file.write('\n')
                print(objstr)
            # No blank line is written to the file between years in this
            # version; only the console output gets a separator.
            print()
            # Pause between requests.
            time.sleep(3)
    # The with-block has closed the file, even if a request or parse failed.
    print('抓取完毕!!!')
复制代码
参考:
【1】python3抓取-双色球开奖的所有历史数据-2003年始
|