This article presents a worked example of using Python to implement a crawler that scrapes NBA statistics. It is shared here for your reference, as follows:
The URL of the site crawled is missing from the original text (see the `url_header`/`url_tail` values in the code); the data gathered covers the NBA 2016–2017 regular season through January 7, 2017.
Change `url_header` and `url_tail` to crawl other data of interest.
The source code is below:
# -*- coding: utf-8 -*-
"""Crawl NBA 2016-2017 regular-season team statistics and export them
to a plain-text file (nba_data.txt) and an Excel (.xls) workbook.

NOTE(review): the published listing had the receiver stripped from many
method calls (e.g. bare ``('utf-8')``, ``(data1)``, ``(filename)``);
the calls below are reconstructed from the surrounding comments and the
data flow -- confirm against the original source if it is available.
The code has also been ported from Python 2 (``print`` statements,
``reload(sys)``) to Python 3.
"""
import time

import requests
from bs4 import BeautifulSoup
from pyExcelerator import Workbook  # NOTE(review): Python-2-era library; xlwt/openpyxl is the modern equivalent

# One table row of the source site is flattened into this many
# consecutive list elements (serial number .. score).
FIELDS_PER_ROW = 24


def getURLLists(url_header, url_tail, pages):
    """Return the URLs of result pages 0..pages (inclusive).

    Args:
        url_header: URL prefix up to the page-number query parameter.
        url_tail: remainder of the query string after the page number.
        pages: index of the last page to fetch.

    Returns:
        list[str]: one URL per page, pages + 1 entries in total.
    """
    return [url_header + str(page) + url_tail for page in range(pages + 1)]


def getNBASingleData(url):
    """Fetch one result page and return its <tbody> text split on newlines.

    Empty strings between cells are NOT filtered here; the caller does that.
    Raises AttributeError if the page has no <tbody> element.
    """
    html = requests.get(url).text
    # Explicit parser avoids BeautifulSoup's "no parser specified" warning
    # and keeps the result deterministic across environments.
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find('tbody').text.split('\n')


def getNBAAllData(url_lists):
    """Fetch every page in url_lists and concatenate the non-empty cells.

    Returns:
        list[str]: all table-cell strings, empty entries removed.
    """
    datasets = []
    for url in url_lists:
        datasets.extend(getNBASingleData(url))
    # Splitting on '\n' leaves empty strings between rows; a comprehension
    # replaces the original remove-while-iterating cleanup loop.
    return [item for item in datasets if item]


def saveDataToExcel(datasets, sheetname, filename):
    """Write datasets to an .xls workbook, FIELDS_PER_ROW cells per row.

    Row 0 holds the column headers; data rows start at row 1.

    Args:
        datasets: flat list of cell strings, FIELDS_PER_ROW per record.
        sheetname: name of the worksheet to create.
        filename: path of the .xls file to save.
    """
    headers = [
        u'Serial number', u'Teams', u'Time', u'Results', u'Host and Guest',
        u'The Match', u'Shooting percentage', u'Number of hits',
        u'Number of strikes', u'Three-point shooting percentage',
        u'Three-point shots', u'Number of three-point attempts',
        u'Free throw percentage', u'Free throw attempts',
        # NOTE(review): 'Free throw attempts' appears twice in the original
        # header list; the second one is presumably "free throws made".
        u'Free throw attempts', u'Rebounds', u'Frontcourt rebound',
        u'Backcourt Rebounds', u'Assists', u'Snatch', u'Caps',
        u'Mistakes', u'Foul play', u'Score',
    ]
    book = Workbook()
    sheet = book.add_sheet(sheetname)
    for col, title in enumerate(headers):
        sheet.write(0, col, title)
    data_len = len(datasets)
    data_cnt = 0
    row_cnt = 0
    while data_cnt < data_len:
        row_cnt += 1
        for col in range(FIELDS_PER_ROW):
            if data_cnt >= data_len:
                # Guard a trailing partial row; the original indexed past
                # the end of datasets when data_len % FIELDS_PER_ROW != 0.
                break
            sheet.write(row_cnt, col, datasets[data_cnt])
            data_cnt += 1
    book.save(filename)


def writeDataToTxt(datasets):
    """Dump datasets to nba_data.txt, one table row per text line.

    Team-name alignment: in the team-name column (second field of each
    row), short names and 'Philadelphia 76ers' get two tabs instead of
    one so the columns line up. The final list element is skipped, as in
    the original (range(len(datasets) - 1)).
    """
    with open('nba_data.txt', 'w') as fp:
        line_cnt = 1
        for item in datasets[:-1]:
            # Bug fix: the original condition parsed as
            # (col2 and short) or 76ers; per its own comment, the 76ers
            # check must also apply only in the team-name column.
            if line_cnt % FIELDS_PER_ROW == 2 and (
                    len(item) < 5 or item == u'Philadelphia 76ers'):
                fp.write(item + '\t\t')
            else:
                fp.write(item + '\t')
            line_cnt += 1
            if line_cnt % FIELDS_PER_ROW == 1:
                fp.write('\n')


if __name__ == "__main__":
    # 1132 records at 150 per page -> last page index 7 (pages 0..7).
    pages = int(1132 / 150)
    url_header = '/query_team.php?page='
    url_tail = '&QueryType=game&order=1&crtcol=date_out&GameType=season&PageNum=3000&Season0=2016&Season1=2017#label_show_result'
    url_lists = getURLLists(url_header, url_tail, pages)
    datasets = getNBAAllData(url_lists)
    writeDataToTxt(datasets)
    sheetname = 'nba normal data 2016-2017'
    str_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    filename = 'nba_normal_data' + str_time + '.xls'
    saveDataToExcel(datasets, sheetname, filename)
For more Python-related content, see this site's topics: "Python Socket Programming Tips Summary", "Python Regular Expression Usage Summary", "Python Data Structures and Algorithms Tutorial", "Summary of Python Function Usage Tips", "Summary of Python String Manipulation Techniques", "Python Introductory and Advanced Classic Tutorials", and "Summary of Python File and Directory Manipulation Techniques".
I hope this article helps you with your Python programming.