Hello Mat

 找回密码
 立即注册
查看: 6008|回复: 7

python抓取双色球历史数据

[复制链接]

1319

主题

1547

帖子

0

金钱

管理员

Rank: 9Rank: 9Rank: 9

积分
22631
发表于 2020-9-13 22:27:27 | 显示全部楼层 |阅读模式
代码:
  1. from requests import get
  2. from bs4 import BeautifulSoup
  3. from user_agent import generate_user_agent
  4. import time

  5. def request_content(start, end):
  6.     url_link = 'https://datachart.500.com/ssq/history/newinc/history.php?start={0}&end={1}'.format(start, end)
  7.     headers = {
  8.         'User-Agent': generate_user_agent(device_type='desktop', os=('mac', 'linux', 'win', 'android'))
  9.     }
  10.     response = get(url_link, headers=headers, timeout=6)
  11.     page_content = BeautifulSoup(response.content, "html.parser")
  12.     html_tag = page_content.find_all('tbody', id='tdata')[0]
  13.     return html_tag.find_all('tr', 't_tr1')

  14. class ssqclazz:
  15.     def __init__(self):
  16.         self.period = ''  # 期号
  17.         self.red_1 = ''  # 红球
  18.         self.red_2 = ''
  19.         self.red_3 = ''
  20.         self.red_4 = ''
  21.         self.red_5 = ''
  22.         self.red_6 = ''
  23.         self.blue_1 = ''  # 蓝球
  24.         self.happy_sunday = ''  # 快乐星期天
  25.         self.pool_prize = ''  # 奖池奖金(元)
  26.         self.first_count = ''  # 一等奖 注数
  27.         self.first_prize = ''  # 一等奖 奖金(元)
  28.         self.second_count = ''  # 二等奖 注数
  29.         self.second_prize = ''  # 二等奖 奖金(元)
  30.         self.total_prize = ''  # 总投注额(元)
  31.         self.lottery_date = ''  # 开奖日期


  32.     def __str__(self):
  33.         return '{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10},{11},{12},{13},{14},{15}'.format(self.period, self.red_1,
  34.                                                                                               self.red_2, self.red_3,
  35.                                                                                               self.red_4, self.red_5,
  36.                                                                                               self.red_6,
  37.                                                                                               self.blue_1,
  38.                                                                                               self.happy_sunday,
  39.                                                                                               self.pool_prize,
  40.                                                                                               self.first_count,
  41.                                                                                               self.first_prize,
  42.                                                                                               self.second_count,
  43.                                                                                               self.second_prize,
  44.                                                                                               self.total_prize,
  45.                                                                                               self.lottery_date)
  46.         
  47.     def tr_tag(self, tag):
  48.         tds = tag.find_all('td')
  49.         index = 0
  50.         self.period = tds[index].string
  51.         index += 1
  52.         self.red_1 = tds[index].string
  53.         index += 1
  54.         self.red_2 = tds[index].string
  55.         index += 1
  56.         self.red_3 = tds[index].string
  57.         index += 1
  58.         self.red_4 = tds[index].string
  59.         index += 1
  60.         self.red_5 = tds[index].string
  61.         index += 1
  62.         self.red_6 = tds[index].string
  63.         index += 1
  64.         self.blue_1 = tds[index].string
  65.         index += 1
  66.         self.happy_sunday = tds[index].string
  67.         index += 1
  68.         self.pool_prize = tds[index].string
  69.         index += 1
  70.         self.first_count = tds[index].string
  71.         index += 1
  72.         self.first_prize = tds[index].string
  73.         index += 1
  74.         self.second_count = tds[index].string
  75.         index += 1
  76.         self.second_prize = tds[index].string
  77.         index += 1
  78.         self.total_prize = tds[index].string
  79.         index += 1
  80.         self.lottery_date = tds[index].string

  81. if __name__ == '__main__':
  82.     file = open('ssq.txt', mode='a+', encoding='utf-8')
  83.     localtime = time.localtime(time.time())
  84.     lyear = localtime.tm_year
  85.     ymin = 3  # 双色球03年上线
  86.     ymax = lyear - 2000
  87.     print('===抓取数据开始===,200%s-20%s' % (ymin, ymax))
  88.     for year in range(ymin, ymax + 1):
  89.         start = '{0}001'.format(year)
  90.         end = '{0}300'.format(year)
  91.         trs = request_content(start, end)
  92.         for tr in trs:
  93.             ssqobj = ssqclazz()
  94.             ssqobj.tr_tag(tr)
  95.             objstr = ssqobj.__str__()
  96.             file.write(objstr)
  97.             file.write('\n')
  98.             print(objstr)
  99.         file.write('\n')
  100.         print()
  101.         time.sleep(3)
  102.     file.close()
  103.     print('抓取完毕!!!')
复制代码
节取2020年的数据如下:
  1. 20089,02,14,16,21,29,30,10, ,0,0,0,0,0,0,2020-09-13
  2. 20088,01,06,12,26,29,30,12, ,1,049,789,346,14,6,377,823,175,137,782,359,763,320,2020-09-10
  3. 20087,11,15,20,23,25,33,10, ,1,066,743,130,5,7,940,558,44,417,692,351,393,474,2020-09-08
  4. 20086,02,04,06,15,24,27,06, ,1,051,310,454,7,7,452,652,117,183,424,383,150,704,2020-09-06
  5. 20085,01,02,05,09,19,24,16, ,1,039,096,893,3,10,000,000,135,175,949,349,243,256,2020-09-03
  6. 20084,03,07,16,17,23,30,07, ,997,837,500,15,5,641,465,265,45,386,340,223,360,2020-09-01
  7. 20083,01,19,25,26,30,31,12, ,1,046,377,044,7,7,916,189,96,265,798,375,628,538,2020-08-30
  8. 20082,02,08,11,17,21,30,09, ,1,025,240,399,22,5,094,481,320,8,119,339,738,944,2020-08-27
  9. 20081,01,05,13,14,27,33,15, ,1,129,524,256,11,7,057,769,108,261,984,341,884,184,2020-08-25
  10. 20080,14,15,18,22,31,33,01, ,1,122,276,717,2,10,000,000,70,397,581,375,199,530,2020-08-23
  11. 20079,05,12,20,21,22,29,14, ,1,058,784,536,6,8,578,055,183,146,641,347,031,342,2020-08-20
  12. 20078,03,11,14,16,21,32,04, ,1,029,746,619,1,10,000,000,76,347,407,339,766,446,2020-08-18
  13. 20077,03,10,16,21,25,27,12, ,960,537,693,27,5,841,121,326,87,079,376,430,570,2020-08-16
  14. 20076,10,15,16,18,20,27,06, ,1,033,084,399,8,6,895,903,82,231,207,346,015,916,2020-08-13
  15. 20075,03,11,13,20,24,30,16, ,1,031,374,526,1,10,000,000,143,155,685,342,593,198,2020-08-11
  16. 20074,04,08,09,13,19,33,12, ,974,585,503,5,8,629,293,107,211,991,375,478,220,2020-08-09
  17. 20073,05,07,11,13,27,29,03, ,949,682,723,8,7,005,129,129,155,436,344,984,126,2020-08-06
  18. 20072,06,08,10,15,17,26,04, ,945,569,865,7,7,525,472,104,212,479,335,806,408,2020-08-04
  19. 20071,09,11,12,13,22,23,08, ,931,954,524,12,6,156,653,133,130,449,363,190,440,2020-08-02
  20. 20070,01,02,04,06,19,21,15, ,953,784,945,23,5,829,067,100,238,356,343,442,594,2020-07-30
  21. 20069,03,09,10,13,18,26,04, ,1,016,346,427,7,7,827,856,99,249,936,343,151,528,2020-07-28
  22. 20068,12,16,21,26,27,32,10, ,996,910,195,17,5,430,373,128,71,448,376,630,286,2020-07-26
  23. 20067,04,07,09,23,27,30,08, ,1,061,790,227,10,6,979,941,140,176,780,353,153,516,2020-07-23
  24. 20066,02,09,13,17,26,28,07, ,1,057,341,835,15,6,006,277,137,137,720,352,149,702,2020-07-21
  25. 20065,09,15,18,21,23,26,08, ,1,090,832,907,5,8,838,315,89,269,544,381,062,334,2020-07-19
  26. 20064,01,03,07,21,27,32,01, ,1,063,056,063,1,10,000,000,77,351,356,356,057,386,2020-07-16
  27. 20063,12,15,16,22,29,32,14, ,991,892,696,3,10,000,000,103,289,264,357,076,054,2020-07-14
  28. 20062,10,14,17,22,26,27,05, ,932,509,919,5,8,901,954,94,259,438,391,827,962,2020-07-12
  29. 20061,08,17,24,26,27,31,04, ,903,858,043,1,10,000,000,91,281,269,366,447,904,2020-07-09
  30. 20060,05,09,14,20,24,30,08, ,837,071,557,38,5,238,209,554,20,424,356,937,202,2020-07-07
  31. 20059,02,04,10,17,22,25,14, ,1,002,178,704,4,10,000,000,190,147,308,386,807,160,2020-07-05
  32. 20058,01,03,11,12,19,26,07, ,958,212,978,18,5,569,985,136,94,299,356,052,050,2020-07-02
  33. 20057,09,14,21,23,26,32,03, ,1,019,998,705,3,10,000,000,113,193,145,349,147,892,2020-06-30
  34. 20056,02,05,08,12,26,31,14, ,984,522,456,17,6,280,167,186,146,255,380,233,782,2020-06-28
  35. 20055,01,05,07,23,28,30,12, ,1,009,674,637,14,5,761,052,198,67,264,327,140,694,2020-06-25
  36. 20054,03,10,19,25,26,31,02, ,1,050,374,135,11,6,596,678,210,104,544,338,817,016,2020-06-23
  37. 20053,02,14,15,16,32,33,01, ,1,057,074,616,19,6,277,278,360,84,264,381,666,534,2020-06-21
  38. 20052,02,08,13,29,32,33,15, ,1,085,336,824,2,10,000,000,53,568,303,352,815,504,2020-06-18
  39. 20051,03,06,08,11,19,28,08, ,1,014,976,572,23,5,442,583,268,47,478,342,968,302,2020-06-16
  40. 20050,04,09,17,20,32,33,15, ,1,101,983,169,5,9,664,099,90,323,895,379,409,292,2020-06-14
  41. 20049,01,07,09,12,18,22,04, ,1,062,851,807,7,7,652,110,140,165,756,355,402,476,2020-06-11
  42. 20048,12,14,18,23,30,32,02, ,1,046,798,674,6,8,364,818,75,336,481,351,604,616,2020-06-09
  43. 20047,04,10,17,19,28,32,01, ,1,021,279,165,10,7,200,661,140,196,487,382,084,220,2020-06-07
  44. 20046,13,19,25,26,27,32,08, ,1,010,760,962,7,6,855,509,87,186,617,350,122,868,2020-06-04
  45. 20045,02,03,15,21,22,33,16, ,1,010,042,406,16,6,075,531,115,187,048,348,295,556,2020-06-02
  46. 20044,07,08,16,17,19,24,07, ,1,042,719,025,10,6,803,090,107,210,641,379,328,476,2020-05-31
  47. 20043,01,11,24,25,27,30,07, ,1,043,134,035,5,7,928,059,64,285,943,360,394,276,2020-05-28
  48. 20042,02,06,07,11,14,31,03, ,1,027,873,219,5,8,415,115,98,217,800,345,472,844,2020-05-26
  49. 20041,06,07,16,22,24,25,13, ,1,005,915,378,2,10,000,000,102,255,217,372,422,582,2020-05-24
  50. 20040,01,06,12,16,19,21,04, ,947,818,891,5,8,416,563,92,232,103,341,533,118,2020-05-21
  51. 20039,02,09,10,11,16,29,02, ,925,841,149,9,6,952,125,166,132,297,341,609,556,2020-05-19
  52. 20038,01,06,07,18,23,24,15, ,922,526,038,15,6,276,646,317,75,511,374,760,420,2020-05-17
  53. 20037,01,04,11,13,17,24,15, ,944,864,355,6,6,810,277,65,208,878,350,126,486,2020-05-14
  54. 20036,07,09,16,22,24,32,06, ,944,994,783,3,10,000,000,104,204,187,346,455,736,2020-05-12
  55. 20035,10,14,24,25,28,33,11, ,911,288,358,4,9,777,104,97,246,242,371,811,232,2020-05-10
  56. 20034,02,08,15,16,26,32,03, ,878,740,204,11,6,780,706,182,134,531,340,793,680,2020-05-07
  57. 20033,07,10,12,21,31,32,01, ,879,873,846,4,10,000,000,97,263,480,320,555,362,2020-05-05
  58. 20032,03,11,13,14,15,26,13, ,843,200,936,11,6,588,951,135,161,837,338,904,002,2020-05-03
  59. 20031,02,05,09,15,16,27,09, ,850,135,149,14,5,603,722,310,34,081,323,903,392,2020-04-30
  60. 20030,17,18,21,29,30,32,03, ,896,891,829,1,10,000,000,74,322,591,327,786,632,2020-04-28
  61. 20029,01,12,18,20,30,32,05, ,835,276,431,7,7,432,574,63,337,857,364,268,814,2020-04-26
  62. 20028,05,06,15,18,26,32,08, ,823,449,380,7,7,465,464,153,140,998,323,795,596,2020-04-23
  63. 20027,12,13,14,24,25,28,06, ,810,989,197,3,10,000,000,112,211,713,324,196,702,2020-04-21
  64. 20026,01,04,09,18,19,31,06, ,769,853,394,3,10,000,000,106,181,297,349,633,986,2020-04-19
  65. 20025,01,09,10,15,20,29,02, ,742,200,788,4,9,977,229,156,159,526,321,611,812,2020-04-16
  66. 20024,10,13,27,28,29,31,08, ,707,451,262,8,7,028,971,94,215,848,317,568,426,2020-04-14
  67. 20023,04,05,20,21,30,33,08, ,702,813,893,6,7,277,843,107,159,661,340,318,270,2020-04-12
  68. 20022,02,10,20,22,25,30,02, ,695,229,480,8,6,761,570,97,181,605,311,416,184,2020-04-09
  69. 20021,02,06,14,16,27,31,07, ,696,474,938,5,7,581,452,116,139,086,296,655,800,2020-04-07
  70. 20020,01,06,09,13,15,22,14, ,685,979,970,4,9,871,526,91,267,666,310,764,432,2020-04-05
  71. 20019,15,19,27,28,30,33,03, ,652,393,178,7,7,437,091,60,355,409,293,920,314,2020-04-02
  72. 20018,05,07,08,11,17,22,13, ,640,479,172,8,6,340,865,188,71,322,287,194,532,2020-03-31
  73. 20017,02,04,07,15,20,27,04, ,650,980,126,15,5,992,154,127,146,479,295,712,918,2020-03-29
  74. 20016,05,06,08,17,24,27,07, ,685,053,744,12,5,979,822,89,165,138,260,860,736,2020-03-26
  75. 20015,08,09,22,24,30,33,01, ,712,719,613,1,10,000,000,60,337,347,255,551,054,2020-03-24
  76. 20014,02,07,08,10,12,16,07, ,661,997,100,5,6,947,355,66,184,408,261,933,168,2020-03-22
  77. 20013,02,08,10,20,21,30,14, ,660,220,960,4,7,500,138,184,67,938,215,545,590,2020-03-19
  78. 20012,04,13,14,23,26,31,09, ,652,719,440,3,8,598,462,112,120,484,198,501,652,2020-03-17
  79. 20011,04,05,07,17,18,29,01, ,638,032,121,8,6,079,680,67,161,146,176,331,724,2020-03-15
  80. 20010,11,14,22,27,31,32,09, ,654,279,153,2,8,485,717,42,207,483,142,522,854,2020-03-12
  81. 20009,03,06,08,14,19,26,12, ,645,107,708,16,6,003,258,253,79,308,405,733,380,2020-01-21
  82. 20008,01,04,06,10,11,28,16, ,680,964,320,17,6,202,629,262,97,541,395,048,088,2020-01-19
  83. 20007,05,12,17,20,25,31,10, ,709,741,377,38,5,371,408,269,65,583,358,035,962,2020-01-16
  84. 20006,03,04,05,10,16,32,09, ,860,929,224,20,5,735,328,363,50,642,361,697,316,2020-01-14
  85. 20005,11,16,17,22,26,32,04, ,920,486,146,8,7,446,888,224,109,236,400,642,476,2020-01-12
  86. 20004,02,15,17,27,32,33,03, ,906,654,598,7,7,900,679,189,134,290,370,826,158,2020-01-09
  87. 20003,09,17,26,29,30,32,03, ,885,816,521,16,6,237,758,134,184,740,368,960,938,2020-01-07
  88. 20002,04,09,14,15,16,29,11, ,911,355,134,16,6,188,650,194,122,541,410,784,878,2020-01-05
  89. 20001,02,15,23,26,29,30,02, ,939,054,504,4,10,000,000,113,275,836,388,311,860,2020-01-02
复制代码
倒序排列,还需要优化。
小小修改一下,即可:
  1. if __name__ == '__main__':
  2.     file = open('ssq.txt', mode='a+', encoding='utf-8')
  3.     localtime = time.localtime(time.time())
  4.     lyear = localtime.tm_year
  5.     ymin = 3  # 双色球03年上线
  6.     ymax = lyear - 2000
  7.     print('===抓取数据开始===,200%s-20%s' % (ymin, ymax))
  8.     for year in range(ymin, ymax + 1):
  9.         start = '{0}001'.format(year)
  10.         end = '{0}300'.format(year)
  11.         trs = request_content(start, end)
  12. #        for tr in trs:
  13.         for i in range(len(trs)):
  14.             tr = trs[len(trs)-i-1]
  15.             
  16.             ssqobj = ssqclazz()
  17.             ssqobj.tr_tag(tr)
  18.             objstr = ssqobj.__str__()
  19.             
  20.             file.write(objstr)
  21.             file.write('\n')
  22.             print(objstr)
  23.         
  24. #        file.write('\n')
  25.         print()
  26.         time.sleep(3)
  27.     file.close()
  28.     print('抓取完毕!!!')
复制代码


参考:
【1】python3抓取-双色球开奖的所有历史数据-2003年始





算法QQ  3283892722
群智能算法链接http://halcom.cn/forum.php?mod=forumdisplay&fid=73
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

Python|Opencv|MATLAB|Halcom.cn ( 蜀ICP备16027072号 )

GMT+8, 2024-10-31 18:21 , Processed in 0.213152 second(s), 25 queries .

Powered by Discuz! X3.4

Copyright © 2001-2021, Tencent Cloud.

快速回复 返回顶部 返回列表