Step 1: identify the goal.
To find the target, first analyze the webpage. Luckily, this site presents everything on a single page, so no pagination handling is required.
Step 2: press F12 to inspect the page source code.
Locate the target elements and work out how to extract the data you need — specifically, find the href attribute associated with each movie title.
Step 3: implement the code to fetch the desired resources.
"""iQiyi movie-list crawler.

Steps:
1. Fetch the URL content.
2. Use CSS selectors to pick out the elements of interest.
3. Save the extracted data to local text files.
"""
# Packages needed by the crawler
import requests                    # fetch web pages
from bs4 import BeautifulSoup      # parse the fetched HTML
import time                        # throttle visits so the site does not block our IP
import re                          # kept from the original source; not used below


class Position():
    """Record pairing a movie's link text with its title text."""

    def __init__(self, position_name, position_require,):
        # Plain data holder: the two strings scraped for one movie entry.
        self.position_name = position_name
        self.position_require = position_require

    def __str__(self):
        # BUG FIX: the original used '/n' (a literal slash-n), not the
        # newline escape '\n', so printed records ran together.
        return '%s%s\n' % (self.position_name, self.position_require)


class Aiqiyi():
    def iqiyi(self, url):
        """Fetch `url` and return a list of Position objects scraped from it.

        Sends a browser-like User-Agent because the site rejects requests
        that identify themselves as a Python script.
        """
        head = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47"
        }  # Simulated browser headers
        html = requests.get(url, headers=head)
        soup = BeautifulSoup(html.content, 'lxml', from_encoding='utf-8')
        # First-level filter: the list wrapper(s).  Prefer an id when one is
        # unique; otherwise fall back to a class selector as done here.
        soupl = soup.select(".qy-list-wrap")
        results = []  # accumulates one Position per movie entry
        for e in soupl:
            biao = e.select('.qy-mod-li')  # second-level filter: list items
            for h in biao:
                # Third-level filter: pull the two text fields for this entry.
                p = Position(
                    h.select_one('.qy-mod-link-wrap').get_text(strip=True),
                    h.select_one('.title-wrap').get_text(strip=True))
                results.append(p)
        return results

    def address(self, url):
        """Fetch `url` and return the href of every <a> inside the list wrapper."""
        head = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47"
        }  # Simulated browser headers
        html = requests.get(url, headers=head)
        soup = BeautifulSoup(html.content, 'lxml', from_encoding='utf-8')
        # All anchor tags under the list-wrapper div.
        alist = soup.find('div', class_='qy-list-wrap').find_all("a")
        ls = []
        for i in alist:
            ls.append(i.get('href'))
        return ls


if __name__ == '__main__':
    time.sleep(2)  # polite 2-second delay before visiting the site
    a = Aiqiyi()
    # NOTE(review): the domain was lost when this snippet was published —
    # presumably 'https://www.iqiyi.com' should prefix this path; verify.
    url = "/www/1/-------------"
    # mode='a+' appends without overwriting existing content.
    with open(file='e:/exercise.txt ', mode='a+') as f:
        for item in a.iqiyi(url):
            line = f'{item.position_name}\t{item.position_require}\n'
            f.write(line)
        print("Download complete.")
    with open(file='e:/address.txt ', mode='a+') as f:
        for item in a.address(url):
            # Scraped hrefs are protocol-relative; restore the 'https' prefix.
            line = f'https{item}\n'
            f.write(line)
        print("Download complete.")
Step 4: run the script and observe the results.
That concludes this article on writing a fun Python crawler to scrape popular movies from iQiyi. For more on scraping movie data with Python, please search my previous posts or browse the related articles below. I hope you will continue to support me!