SoFunction
Updated on 2024-10-29

Python crawler to crawl the NBA data function example

In this article, the example of Python to implement the crawler to crawl the NBA data function. Shared for your reference, as follows:

The URL of the crawled site was omitted from this article; shown below is the code used to crawl the data for the NBA 2016–2017 regular season through January 7, 2017.

Change url_header and url_tail to crawl specific other data.

The source code is below:

#coding=utf-8
# Module header for a Python 2 crawler script.
# The two lines after `import sys` restore the default-encoding setup that was
# garbled in the article text; they are Python 2 only (reload/setdefaultencoding
# do not exist as builtins in Python 3 and are unnecessary there).
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
import time
import urllib
from bs4 import BeautifulSoup
import re
from pyExcelerator import *
def getURLLists(url_header, url_tail, pages):
  """
  Build the list of result-page URLs to crawl.

  Page numbers run from 0 through `pages` inclusive; each URL is
  url_header + page_number + url_tail.

  :param url_header: URL prefix up to (and including) the page parameter.
  :param url_tail: URL suffix appended after the page number.
  :param pages: highest page number to fetch (pages+1 URLs are returned).
  :return: list of URL strings, page 0 first.
  """
  url_lists = []
  # One loop covers page 0 as well; the original duplicated the
  # construction of the first URL outside the loop.
  for page in range(pages + 1):
    url_lists.append(url_header + str(page) + url_tail)
  # Debug output kept from the original (parenthesized so it also
  # parses under Python 3).
  print(url_lists[0])
  return url_lists
def getNBAAllData(url_lists):
  """
  Fetch every page in url_lists and merge the rows into one flat list.

  Each page contributes the lines of its stats table (via
  getNBASingleData); blank lines from the HTML text are dropped.

  :param url_lists: iterable of page URLs to fetch.
  :return: flat list of non-empty cell strings.
  """
  datasets = []
  for url in url_lists:
    # Restored call: the article had the `datasets.` receiver stripped.
    datasets.extend(getNBASingleData(url))
  # Filter with a comprehension instead of the original
  # remove-while-iterating-a-copy pattern.
  return [cell for cell in datasets if len(cell) > 0]
def getNBASingleData(url):
  """
  Fetch one results page and return the stats-table text split into lines.

  :param url: full URL of a single results page.
  :return: list of strings, one per line of the table body text; empty
           list if the page has no <tbody> element.
  """
  # Restored call: the original used the already-imported requests
  # library (the garbled line was `requests.get(url).text`).
  html = requests.get(url).text
  # Name the parser explicitly so BeautifulSoup does not pick one
  # that varies by installation.
  soup = BeautifulSoup(html, 'html.parser')
  tbody = soup.find('tbody')
  if tbody is None:
    # Guard: an unexpected page layout no longer raises AttributeError.
    return []
  return tbody.text.split('\n')
def saveDataToExcel(datasets, sheetname, filename):
  """
  Write the flat list of cells to an .xls file, 24 columns per row.

  Row 0 holds the column headers; data rows follow, consuming the
  flat `datasets` list in chunks of 24 cells.

  :param datasets: flat list of cell values (24 per logical row).
  :param sheetname: name of the worksheet to create.
  :param filename: path of the .xls file to save.
  """
  book = Workbook()
  sheet = book.add_sheet(sheetname)
  # Column headers as in the original article. NOTE(review): columns 13
  # and 14 carry the same label there ('Free throw attempts') — kept
  # verbatim; presumably one is "free throws made" — confirm.
  headers = [u'Serial number', u'Teams', u'Time', u'Results',
             u'Host and Guest', u'The Match', u'Shooting percentage',
             u'Number of hits', u'Number of strikes',
             u'Three-point shooting percentage', u'Three-point shots',
             u'Number of three-point attempts', u'Free throw percentage',
             u'Free throw attempts', u'Free throw attempts', u'Rebounds',
             u'Frontcourt rebound', u'Backcourt Rebounds', u'Assists',
             u'Snatch', u'Caps', u'Mistakes', u'Foul play', u'Score']
  for col, title in enumerate(headers):
    sheet.write(0, col, title)
  num = len(headers)  # 24 cells per data row
  row_cnt = 0
  data_cnt = 0
  data_len = len(datasets)
  print('data_len:', data_len)
  while data_cnt < data_len:
    row_cnt += 1
    print('Serial number:', row_cnt)
    for col in range(num):
      if data_cnt >= data_len:
        # Guard: a dataset whose length is not a multiple of 24 no
        # longer raises IndexError on the last (partial) row.
        break
      sheet.write(row_cnt, col, datasets[data_cnt])
      data_cnt += 1
  book.save(filename)
def writeDataToTxt(datasets):
  """
  Write the cells to nba_data.txt, tab-separated, 24 cells per line.

  The second cell of each 24-cell row is a team name; short names (and
  the long 'Philadelphia 76ers') get a double tab so the plain-text
  columns stay aligned, matching the original article's intent.

  NOTE(review): the original iterates range(len(datasets)-1), so the
  last element is never written — behavior kept; confirm intended.

  :param datasets: flat list of cell strings (24 per logical row).
  """
  # `with` replaces the original open/close pair so the handle is
  # released even if a write fails.
  with open('nba_data.txt', 'w') as fp:
    line_cnt = 1
    for i in range(len(datasets) - 1):
      if (line_cnt % 24 == 2 and len(datasets[i]) < 5) \
          or datasets[i] == u'Philadelphia 76ers':
        fp.write(datasets[i] + '\t\t')
      else:
        fp.write(datasets[i] + '\t')
      line_cnt += 1
      if line_cnt % 24 == 1:
        # 24 cells written -> end the text row.
        fp.write('\n')
if __name__ == "__main__":
  pages = int(1132/150)
  url_header = '/query_team.php?page='
  url_tail = '&QueryType=game&order=1&crtcol=date_out&GameType=season&PageNum=3000&Season0=2016&Season1=2017#label_show_result'
  url_lists = getURLLists(url_header,url_tail,pages)
  datasets = getNBAAllData(url_lists)
  writeDataToTxt(datasets)
  sheetname = 'nba normal data 2016-2017'
  str_time = ('%Y-%m-%d',(()))
  filename = 'nba_normal_data'+str_time+'.xls'
  saveDataToExcel(datasets,sheetname,filename)

More Python-related content can be found in this site's topic guides: "Python Socket Programming Tips Summary", "Python Regular Expression Usage Summary", "Python Data Structures and Algorithms Tutorial", "Summary of Python Function Usage Tips", "Summary of Python String Manipulation Techniques", "Python Introductory and Advanced Classic Tutorials", and "Summary of Python File and Directory Manipulation Techniques".

I hope the description of this article will help you in Python programming.