
Python crawler: sample code for building a proxy IP pool

Anyone who writes crawlers has run into IP bans, and free proxy IPs are now hard to find online. So here we use Python's requests library to crawl proxy IPs ourselves and build an IP proxy pool to draw from when needed.

The code covers crawling the IPs, checking whether each one is usable, and saving the usable ones. The function get_proxies then returns a proxy, for example: {'HTTPS': '106.12.7.54:8118'}
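For example, once the pool file exists, the dict that get_proxies returns can be passed straight to requests. This is only a minimal usage sketch; the target URL below is a placeholder and not part of the original code:

import requests

proxies = get_proxies()  # e.g. {'HTTPS': '106.12.7.54:8118'}; get_proxies is defined in the source below
res = requests.get('https://example.com', proxies=proxies, timeout=5)  # placeholder URL
print(res.status_code)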

The full source code, with detailed comments, is below:

import requests
from lxml import etree
import urllib3
import random, time
 
urllib3.disable_warnings()
 
 
def spider(pages, max_change_proxies_times=300):
  """
  Crawl the protocol type, IP address and port number of HTTP proxies.

  Every crawled IP can be saved to raw_ips.csv for later processing; check_proxies() is used to check whether a crawled proxy IP is usable.
  -----
  :param pages: number of pages to crawl
  :return: no return value
  """
  s = requests.Session()
  s.trust_env = False
  s.verify = False  # Skip SSL verification; the matching urllib3 warnings are disabled above.
  urls = 'https://www.xicidaili.com/nn/{}'  # Free-proxy listing pages; the URL was truncated in the original, so this is a reconstruction.
  proxies = {}
  try_times = 0
  for i in range(pages):
    url = urls.format(i + 1)
    s.headers = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
      'Accept-Encoding': 'gzip, deflate, br',
      'Accept-Language': 'zh-CN,zh;q=0.9',
      'Connection': 'keep-alive',
      'Referer': urls.format(i if i > 0 else ''),
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'}
    while True:
      content = s.get(url, headers=s.headers, proxies=proxies)
      time.sleep(random.uniform(1.5, 4))  # Pause after every page, or you'll get blocked.
      if content.status_code == 503:  # A 503 means this IP is blocked, so switch to another one.
        proxies = get_proxies()
        try_times += 1
        print(f'Proxy change #{str(try_times):0>3s}, now using {proxies}')
        if try_times > max_change_proxies_times:
          print('Maximum number of attempts exceeded, connection failed!')
          return -1
        continue
      else:
        break  # If the return code is 200, leave the while loop and process the crawled page.
 
    print(f'Crawling page {i + 1} of {pages}')
    tree = etree.HTML(content.text)  # Parse the page once per page rather than once per row.
    for j in range(2, 102):  # Extract protocol, host and port with a simple XPath.
      # The table id below is an assumption (the attribute was stripped in the original); adjust the XPath to the target page.
      http = tree.xpath(f'//table[@id="ip_list"]/tr[{j}]/td[6]/text()')[0]
      host = tree.xpath(f'//table[@id="ip_list"]/tr[{j}]/td[2]/text()')[0]
      port = tree.xpath(f'//table[@id="ip_list"]/tr[{j}]/td[3]/text()')[0]
      check_proxies(http, host, port)  # Check whether the extracted proxy IP is usable.
 
 
def check_proxies(http, host, port, test_url='https://www.baidu.com'):
  """
  Check whether the given proxy information is usable.

  Builds a proxies dict from http, host and port, tests connectivity to test_url, and saves the proxy to ips_pool.csv if the test passes.
  :param http: transport protocol type
  :param host: host
  :param port: port number
  :param test_url: URL used to test the proxy (the default was omitted in the original; any stable site will do)
  :return: None
  """
  proxies = {http: host + ':' + port}
  try:
    res = requests.get(test_url, proxies=proxies, timeout=2)
    if res.status_code == 200:
      print(f'{proxies} test passed')
      with open('ips_pool.csv', 'a+') as f:
        f.write(','.join([http, host, port]) + '\n')
  except Exception as e:  # If the test fails, don't save the proxy, and don't let the exception stop the program.
    print(e)
 
 
def check_local_ip(fn, test_url):
  """
  Check whether the proxy IPs stored in the local IP pool are still usable.

  Reads fn line by line, tests a connection to test_url through each proxy, and appends every proxy that still works to ips_pool.csv.
  :param fn: filename of the file where the proxy IPs are stored
  :param test_url: URL used to test the proxies
  :return: None
  """
  with open(fn, 'r') as f:
    datas = f.readlines()
    ip_pools = []
  for data in datas:
    ip_msg = data.strip().split(',')  # Each line has the form "protocol,host,port".
    http = ip_msg[0]
    host = ip_msg[1]
    port = ip_msg[2]
    proxies = {http: host + ':' + port}
    try:
      res = requests.get(test_url, proxies=proxies, timeout=2)
      if res.status_code == 200:
        ip_pools.append(data)
        print(f'{proxies} test passed')
        with open('ips_pool.csv', 'a+') as f:
          f.write(','.join([http, host, port]) + '\n')
    except Exception as e:
      print(e)
      continue
 
 
def get_proxies(ip_pool_name='ips_pool.csv'):
  """
  Get a random proxy IP from the IP pool.
  :param ip_pool_name: str, filename of the file holding the IP pool
  :return: a proxies dict, e.g.: {'HTTPS': '106.12.7.54:8118'}
  """
  with open(ip_pool_name, 'r') as f:
    datas = f.readlines()
  ran_num = random.choice(datas)
  ip = ran_num.strip().split(',')
  proxies = {ip[0]: ip[1] + ':' + ip[2]}
  return proxies
 
 
if __name__ == '__main__':
  t1 = time.time()
  spider(pages=3400)
  t2 = time.time()
  print('Crawl complete, elapsed time:', t2 - t1)
 
  # check_local_ip('raw_ips.csv', 'https://www.baidu.com')  # The test URL was omitted in the original; any stable site works.
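As a usage note, the rotate-on-failure pattern that spider() applies to 503 responses can be wrapped into a small helper for your own crawls. The sketch below is an illustration only: the function name fetch_with_proxy, the retry count and the example URL are my own additions, not part of the original code.

def fetch_with_proxy(url, retries=5):
  """Try up to `retries` random proxies from ips_pool.csv before giving up."""
  for _ in range(retries):
    proxies = get_proxies()
    try:
      res = requests.get(url, proxies=proxies, timeout=5)
      if res.status_code == 200:
        return res
    except Exception as e:
      print(f'{proxies} failed: {e}')
  return None

# Example: res = fetch_with_proxy('https://example.com')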

That is the detailed sample code for building a proxy IP pool with a Python crawler. For more on building proxy IP pools with Python crawlers, please see my other related articles!