1. Get pictures
# Open the Geetest demo page, start the text-click captcha, and save
# screenshots of the prompt strip and of the background picture.
#
# NOTE(review): this snippet was reconstructed from a copy whose line breaks
# and attribute accesses were lost. The selenium import paths, the browser
# class (Chrome is assumed) and the final driver.close() are the conventional
# choices; confirm against your setup. The URL, the driver path, the XPath
# `[@]` predicate and both screenshot file names were truncated in the source
# and must be filled in before running.
import re
import time

import ddddocr
import requests
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/")
driver = webdriver.Chrome(service=service)

# 1. Open the home page
driver.get('/adaptive-captcha-demo')

# 2. Click [Text click Verification]
#    Poll every 0.5 s, for up to 30 s, until the element exists.
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.XPATH,
    '//*[@]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
))
tag.click()

# 3. Click to start verification
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.CLASS_NAME,
    'geetest_btn_click'
))
tag.click()

# Give the captcha widget time to render before taking screenshots.
time.sleep(5)

# The target image to be identified (the strip showing the words to click)
target_tag = driver.find_element(
    By.CLASS_NAME,
    'geetest_ques_back'
)
target_tag.screenshot("")

# The background picture in which the words have to be located
bg_tag = driver.find_element(
    By.CLASS_NAME,
    'geetest_bg'
)
bg_tag.screenshot("")

# Keep the browser open for manual inspection, then close it.
time.sleep(2000)
driver.close()
2. Target recognition
Take a screenshot of each target character and recognize it with ddddocr.
# Start the Geetest text-click captcha and OCR each character of the prompt
# (the words the user is asked to click) with ddddocr.
#
# NOTE(review): this snippet was reconstructed from a copy whose line breaks
# and attribute accesses were lost. The selenium import paths, the browser
# class (Chrome is assumed) and the final driver.close() are the conventional
# choices; confirm against your setup. The URL, the driver path and the XPath
# `[@]` predicate were truncated in the source and must be filled in.
import re
import time

import ddddocr
import requests
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/")
driver = webdriver.Chrome(service=service)

# 1. Open the home page
driver.get('/adaptive-captcha-demo')

# 2. Click [Text click verification]
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.XPATH,
    '//*[@]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
))
tag.click()

# 3. Click to start verification
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.CLASS_NAME,
    'geetest_btn_click'
))
tag.click()

# 4. Wait for the verification code to come out
time.sleep(5)

# 5. Identify task pictures: every <img> inside the prompt strip is one
#    character, so screenshot each one and run it through the classifier.
target_word_list = []
parent = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
tag_list = parent.find_elements(By.TAG_NAME, "img")

# The engine is loop-invariant, so build it once instead of per image.
ocr = ddddocr.DdddOcr(show_ad=False)
for tag in tag_list:
    word = ocr.classification(tag.screenshot_as_png)
    target_word_list.append(word)

print("Text to be identified:", target_word_list)

# Keep the browser open for manual inspection, then close it.
time.sleep(2000)
driver.close()
3. Background coordinate recognition
3.1 ddddocr
ddddocr can recognize the characters, but its default recognition rate is rather low. To improve it, you can set up a PyTorch
environment and train your own model; see: /sml2h3/dddd_trainer
# Recognize the prompt words, then use ddddocr's detection mode to find every
# character in the background picture and map each recognized character to the
# centre of its bounding box.
#
# NOTE(review): this snippet was reconstructed from a copy whose line breaks
# and attribute accesses were lost. The selenium import paths, the browser
# class (Chrome is assumed) and the final driver.close() are the conventional
# choices; confirm against your setup. The URL, the driver path and the XPath
# `[@]` predicate were truncated in the source and must be filled in.
import re
import time
from io import BytesIO

import ddddocr
import requests
from PIL import Image, ImageDraw
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/")
driver = webdriver.Chrome(service=service)

# 1. Open the home page
driver.get('/adaptive-captcha-demo')

# 2. Click [Text click verification]
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.XPATH,
    '//*[@]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
))
tag.click()

# 3. Click to start verification
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.CLASS_NAME,
    'geetest_btn_click'
))
tag.click()

# 4. Wait for the verification code to come out
time.sleep(5)

# One classifier instance serves both the prompt images and the cropped
# background characters; it is loop-invariant, so build it once.
classifier = ddddocr.DdddOcr(show_ad=False)

# 5. Identify task pictures
target_word_list = []
parent = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
tag_list = parent.find_elements(By.TAG_NAME, "img")
for tag in tag_list:
    word = classifier.classification(tag.screenshot_as_png)
    target_word_list.append(word)

print("Text to be identified:", target_word_list)

# 6. Background picture
bg_tag = driver.find_element(
    By.CLASS_NAME,
    'geetest_bg'
)
content = bg_tag.screenshot_as_png

# 7. Identify all text in the background and get coordinates
detector = ddddocr.DdddOcr(show_ad=False, det=True)
poses = detector.detection(content)  # [(x1, y1, x2, y2), (x1, y1, x2, y2), ...]

# 8. Loop over each bounding box, crop it out and recognize the character
bg_word_dict = {}
img = Image.open(BytesIO(content))
for box in poses:
    x1, y1, x2, y2 = box

    # Get the picture of each character according to its coordinates
    corp = img.crop(box)
    img_byte = BytesIO()
    corp.save(img_byte, 'png')

    # Identify text (the recognition rate on these crops is low)
    word = classifier.classification(img_byte.getvalue())

    # Store the centre point of the box for each word, e.g. {"Duck": [x, y]}
    bg_word_dict[word] = [int((x1 + x2) / 2), int((y1 + y2) / 2)]

print(bg_word_dict)

# Keep the browser open for manual inspection, then close it.
time.sleep(1000)
driver.close()
3.2 Coding platform
/
# Upload a saved captcha picture to the coding (human/OCR solving) platform
# and print the JSON answer.
#
# NOTE(review): this snippet was reconstructed from a copy whose line breaks
# and attribute accesses were lost. requests.post / res.json() are the
# conventional calls for a form upload that returns JSON. The endpoint host
# and the picture file name were truncated in the source and must be filled in.
import base64
from hashlib import md5

import requests

# Read the picture through a context manager so the handle is always closed.
with open('', 'rb') as image_file:
    file_bytes = image_file.read()

res = requests.post(
    url='/Upload/',
    data={
        'user': "deng",
        # The password is sent as the hex MD5 digest of the plain password.
        'pass2': md5("password".encode('utf-8')).hexdigest(),
        'codetype': "9501",
        'file_base64': base64.b64encode(file_bytes)
    },
    headers={
        'Connection': 'Keep-Alive',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
    }
)

res_dict = res.json()
print(res_dict)
# {'err_no': 0, 'err_str': 'OK', 'pic_id': '1234612060701120002', 'pic_str': ',86,73|Flour,111,38|Dish,40,49|Fragrant,198,101', 'md5': 'faac71fc832b2ead01ffb4e813f3be60'}
Combining the Geetest screenshots with the coding-platform recognition:
# Recognize the prompt words with ddddocr, send the background picture to the
# coding platform, and build a {word: (x, y)} map from its answer.
#
# NOTE(review): this snippet was reconstructed from a copy whose line breaks
# and attribute accesses were lost. The selenium import paths, the browser
# class (Chrome is assumed), requests.post / res.json() and the final
# driver.close() are the conventional choices; confirm against your setup.
# The page URL, the upload host, the driver path, the XPath `[@]` predicate
# and the screenshot file name were truncated in the source; fill them in.
import base64
import re
import time
from hashlib import md5
from io import BytesIO

import ddddocr
import requests
from PIL import Image, ImageDraw
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/")
driver = webdriver.Chrome(service=service)

# 1. Open the home page
driver.get('/adaptive-captcha-demo')

# 2. Click [Text click verification]
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.XPATH,
    '//*[@]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
))
tag.click()

# 3. Click to start verification
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.CLASS_NAME,
    'geetest_btn_click'
))
tag.click()

# 4. Wait for the verification code to come out
time.sleep(5)

# 5. Identify task pictures
target_word_list = []
parent = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
tag_list = parent.find_elements(By.TAG_NAME, "img")

# The engine is loop-invariant, so build it once instead of per image.
ocr = ddddocr.DdddOcr(show_ad=False)
for tag in tag_list:
    word = ocr.classification(tag.screenshot_as_png)
    target_word_list.append(word)

print("Text to be identified:", target_word_list)

# 6. Background picture (kept in memory for the upload and also saved to disk)
bg_tag = driver.find_element(
    By.CLASS_NAME,
    'geetest_bg'
)
content = bg_tag.screenshot_as_png
bg_tag.screenshot("")

# 7. Identify all text in the background and get coordinates
res = requests.post(
    url='/Upload/',
    data={
        'user': "deng",
        # The password is sent as the hex MD5 digest of the plain password.
        'pass2': md5("password".encode('utf-8')).hexdigest(),
        'codetype': "9501",
        'file_base64': base64.b64encode(content)
    },
    headers={
        'Connection': 'Keep-Alive',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
    }
)

res_dict = res.json()
print(res_dict)

# 8. Coordinates of each word {"Duck":(196,85), ...}   target_word_list = ["Flower","Duck","word"]
#    "pic_str" has the form "word,x,y|word,x,y|..."; x and y stay strings here.
bg_word_dict = {}
for item in res_dict["pic_str"].split("|"):
    word, x, y = item.split(",")
    bg_word_dict[word] = (x, y)

print(bg_word_dict)

# Keep the browser open for manual inspection, then close it.
time.sleep(1000)
driver.close()
4. Coordinate click
Click on the verification code according to the coordinates.
ActionChains(driver).move_to_element_with_offset(Tag Object, xoffset=x, yoffset=y).click().perform()
# Full flow: recognize the prompt words, get the coordinates of every word in
# the background picture from the coding platform, then click the prompt
# words in order on the background.
#
# NOTE(review): this snippet was reconstructed from a copy whose line breaks
# and attribute accesses were lost. The selenium import paths, the browser
# class (Chrome is assumed), requests.post / res.json() and the final
# driver.close() are the conventional choices; confirm against your setup.
# The page URL, the upload host, the driver path and the XPath `[@]`
# predicate were truncated in the source and must be filled in.
import base64
import re
import time
from hashlib import md5
from io import BytesIO

import ddddocr
import requests
from PIL import Image, ImageDraw
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/")
driver = webdriver.Chrome(service=service)

# 1. Open the home page
driver.get('/adaptive-captcha-demo')

# 2. Click [Text click verification]
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.XPATH,
    '//*[@]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
))
tag.click()

# 3. Click to start verification
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.CLASS_NAME,
    'geetest_btn_click'
))
tag.click()

# 4. Wait for the verification code to come out
time.sleep(5)

# 5. Identify task pictures
target_word_list = []
parent = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
tag_list = parent.find_elements(By.TAG_NAME, "img")

# The engine is loop-invariant, so build it once instead of per image.
ocr = ddddocr.DdddOcr(show_ad=False)
for tag in tag_list:
    word = ocr.classification(tag.screenshot_as_png)
    target_word_list.append(word)

print("Text to be identified:", target_word_list)

# 6. Background picture
bg_tag = driver.find_element(
    By.CLASS_NAME,
    'geetest_bg'
)
content = bg_tag.screenshot_as_png
# bg_tag.screenshot("")

# 7. Identify all text in the background and get coordinates
res = requests.post(
    url='/Upload/',
    data={
        'user': "deng",
        # The password is sent as the hex MD5 digest of the plain password.
        'pass2': md5("Own password".encode('utf-8')).hexdigest(),
        'codetype': "9501",
        'file_base64': base64.b64encode(content)
    },
    headers={
        'Connection': 'Keep-Alive',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
    }
)

res_dict = res.json()

# "pic_str" has the form "word,x,y|word,x,y|..."; x and y stay strings here.
bg_word_dict = {}
for item in res_dict["pic_str"].split("|"):
    word, x, y = item.split(",")
    bg_word_dict[word] = (x, y)

print(bg_word_dict)
# Example values:
# target_word_list = ['Pink', 'Dish', 'Scent']
# bg_word_dict = {'Pink': ('10', '10'), 'Dish': ('50', '50'), 'Scent': ('100', '93')}

# 8. Click
for word in target_word_list:
    time.sleep(2)
    group = bg_word_dict.get(word)
    if not group:
        # The platform did not return this word; skip it.
        continue

    x, y = group
    # The returned coordinates are relative to the picture's top-left corner.
    # Subtracting half the element size converts them to an offset from the
    # element's centre.
    # NOTE(review): this assumes a Selenium version in which
    # move_to_element_with_offset measures from the element centre; confirm.
    x = int(x) - int(bg_tag.size['width'] / 2)
    y = int(y) - int(bg_tag.size['height'] / 2)
    ActionChains(driver).move_to_element_with_offset(bg_tag, xoffset=x, yoffset=y).click().perform()

# Keep the browser open for manual inspection, then close it.
time.sleep(1000)
driver.close()
5. Image verification code
Many login, registration and other frequently used pages add an image verification code (captcha).
To automate such pages in code, you must first recognize the verification code automatically and only then carry out the remaining steps.
6. Identification
The Python module ddddocr
can recognize image verification codes.
pip3.11 install ddddocr==1.4.9 -i /pypi/simple/ pip3.11 install Pillow==9.5.0
pip install ddddocr==1.4.9 -i /pypi/simple/ pip install Pillow==9.5.0
6.1 Local Identification
# Recognize a captcha picture stored on disk.
#
# NOTE(review): reconstructed from a copy whose line breaks and attribute
# accesses were lost. The picture file name under "img/" was truncated in the
# source and must be filled in.
import ddddocr

ocr = ddddocr.DdddOcr(show_ad=False)

with open("img/", mode='rb') as f:
    body = f.read()

code = ocr.classification(body)
print(code)
6.2 Online identification
You can also directly request to obtain the picture and then directly identify it:
# Download a captcha picture and recognize it directly from the response body.
#
# NOTE(review): reconstructed from a copy whose line breaks and attribute
# accesses were lost. requests.get is assumed for the download, and the scheme
# and host of the URL were truncated in the source. Confirm and fill in.
import ddddocr
import requests

res = requests.get(url="/captcha/create/reg?_t=1701511836608")

ocr = ddddocr.DdddOcr(show_ad=False)
# res.content is the raw picture bytes, which is what the classifier expects.
code = ocr.classification(res.content)
print(code)
# Download a captcha picture identified by a captcha token and recognize it.
#
# NOTE(review): reconstructed from a copy whose line breaks and attribute
# accesses were lost. requests.get is assumed for the download, and the scheme
# and host of the URL were truncated in the source. Confirm and fill in.
import ddddocr
import requests

res = requests.get(
    # Plain string: the original f-string had no placeholders.
    url="/api/auth/captcha?captcha_token=n5A6VXIsMiI4MTKoco0VigkZbByJbDahhRHGNJmS"
)

ocr = ddddocr.DdddOcr(show_ad=False)
# res.content is the raw picture bytes, which is what the classifier expects.
code = ocr.classification(res.content)
print(code)
6.3 base64
Some platforms deliver the captcha picture as a base64-encoded string; decode it to bytes first and then recognize it.
# Decode a base64-encoded captcha picture and recognize it with ddddocr.
#
# NOTE(review): reconstructed from a copy whose line breaks and attribute
# accesses were lost. The base64 payload was split across two lines in the
# source; the two halves are joined below as adjacent string literals.
# Verify that no characters were lost at the seam before relying on it.
import base64

import ddddocr

content = base64.b64decode(
    "iVBORw0KGgoAAAANSUhEUgAAAGQAAAAoCAYAAAAIeF9DAAAHGElEQVR4Xu2a2VNTZxTAHZ/62of+BX3rdPrUmaq1da3WQWur1mqntrQWLe7UkUoQlEWFqFDZZN8hUBWKQUVpQDCyVUeltVWIIiAEZHWBAEk4zffZe+bmS+6SEEzE/GbOkHvPuXeY85t7vyWZBV48ilnsCS/uxSvEw3hthJydXWITnsiMFyLWfLGcu5jRQuQ2W27dy8ArBOTXvQxmrBBHm+xo/XQhKkTffRu0NSfgt8KvISttGaQlfQyFOWtBXboDbt7Ig6HBdvYSj6C7awDC3kqGg4oC8N0YC6uWhcPCOUHgszQMtvudgsK8Gnj2dNTqGmeErH47Z8rBYlfI6MgAXLqwH5Lj50iGJzL//UDJWP1pBDTfasNrnBEyHdgIef6sF1R5X9o0Xig8Ebb5QrF8QajlLTDoMTIIVkLMZiOcKfrWquGXLyqgo70BDKNDNG8wDFleCTehsS4JivK/4l/uMWxco4T4WDUo30yH+zo9PH0yavn/x+nnuBg1LPhgP0qJjjzjuUJuNGWiiJSEedByt4KffiWxt9bIz65GISveO2CVczcoxGQah+y05SikqT6ZX/fKw4khkfNGEQpZNDeILXUrKKT13mWUkZmyBIxGA7/O5XT2PoL0c9kQcPIX+D5yK2w/vgcis6Oh5qYWJicnac2msM0YrmRw4BkK+cwyA3OEuLh4CApS0GhoaGTTNtTXN2B9fHwCm7YBhdRWR6GQK5rD/BqXU9FQCb4RflYN50d0fgwYxsemTUjJ6ToUEh6iYtOi1NXVY4MTEhLZtA2khquvr69n0zagEP5gfu/uBXruga4aykr8ISv1E0g/tRBUueug+o8I6NH/hTdwFO3tOhsB9iK5NN2lQsbHjZbJSR9kplbC4nkKKsNnySHoejTAlooyOjoKisAXDSYRN2cvW4Lo9XqsCwkJpddKgUJyM3xQSH9fC9RUHbWabbFBnigy63KEEcMIbI3agU0mrynN9WoYfDoIJrOJ/iXH5DwryFnYqS4XP26Kg86OPrZcklPvBIBKVYSNVqvL6Tl7qNVqrCsqKmbTdkEhaUkfYbOv1cbaCLAXNVVR/HtJQl5VXIO3Ru+E3sHHbAmFnCf56RKyb3cmPLjfw5bKgjS/tVWHjY6IiISkd22FmEwmmuPqdDodW2IXFJKaOB8bnZr4IRQXbITWlst01U6ehJGRfnpM1h58KY68vo4VxGKD1doXr0UhSH66hHCx/+dsePJkhL1EkiSLlJDdQdjs5mbbHjQ3N2NeqTyGExUpUAgZJ7gmny32BeOE/ffdhOX8adUmrK2qlD9L2RWzFxvc3adn01Z09XW7RAiH0WiCx73DoKm8DVt8E1DK+tVRTknRaDTY8MzMLDZNz3F5jaaKTQuCQgpy1mCTH3U08Wts6LSs3Lnawty1bFoQMr3lGjxhnGDTVpC8K4XwMZvNELwvF6XEnTjHlkgyPDwMCkUwbTj5S47l5KRAIefL9mCThZ4ODpLnasnsSy6eIoTQ/vAxClm36iibloXQUyD19IiBQhquJTgnJHkRmxbEna8slgnLNJgT4uxq3d44QYJ8FhtfxEAhXZ3XscmdHeIrULLZyNWq8tazaUHcMagL0XKvC4WQ70ucwXYmdZ/OprhjkiM1joBCJifNkJe5kjaZDuoCWyfsoO7I1Ncd0157jI1NwC7/FBQSGJDFlsiGv9YoLi6m6w3uuLy8nC2XxGq395+/S7HRZNqra6m0rC4HLYOgif4lx8X5G7AmOX4uDPTLm18T7C0Mq65fsSwIh/5fGA7R46ksDDd8oYSTx89BnfZfut1O9q1MJjPdfm970EO3Tcj2PH/6e7XmDnsb2bCrcRLcsV7v+FrHSgh5SviDu1RUrAyB1tlX+beQRO7WSUppBn7+LtyPvY0"
    "g/EbLicOH5K2gxeDvV3GRmJjElsnCSgiBDNgV5wNtms8P8l3Jn41pluoXix0ixRExcjYXh58/weOflLvYWwjCNlwoyECennzJ8vTLW7CJQXZ9WSGNjeLjsBA2Qjja27RQeTEYcsJW0G2VjJTF9McO2toYwR83OCKFbL+nlWVBwK+BdDq87Zj19jvJc0ICE4PZywUhP3BQ/94E4QdU8MM3J+HzFZH0Bw5L5wfDGp8jsHdnBuRlVdNFoqswGAwQGnoQZZDPY2NjbJksBIUQHGkwhzPX2KPkShkKSTqbwqZnLIJCptLYqVxLaNd3gN/RbSik9paWLZmxuEVIcMohKL92EVo6dNA/PABGk5F+IdXW/RBOV5XA5iP+KMNfuRvGJ8bZW8xY3CKEHcTFouGO+L7aTMNjhWw+7P9avao4BIUQpBprDznXtPd0wJmqUjiSo4R98Qq6WPSN2EIXhBFZUXRAJ9Pe1xFRIQQ5DeZwpNaLfSSFEEijxZotlfciH1lCOLjGs+HFdTgkxMv08x9BPe61Ol73uQAAAABJRU5ErkJggg=="
)

# Optional: dump the decoded picture to disk to inspect it.
# with open('', mode='wb') as f:
#     f.write(content)

ocr = ddddocr.DdddOcr(show_ad=False)
code = ocr.classification(content)
print(code)
7. Case: xwenjie
/
# Log in to the xwenjie site: request a captcha, recognize it with ddddocr,
# then submit it together with the credentials.
#
# NOTE(review): reconstructed from a copy whose line breaks and attribute
# accesses were lost. The HTTP verbs are inferred: GET for fetching the
# captcha data and picture, POST for the JSON login. The scheme and host of
# both API URLs were truncated in the source. Confirm and fill in.
import requests
import ddddocr

# Obtain the image verification code address
res = requests.get(url="/api/auth/captcha/generate")
res_dict = res.json()

captcha_token = res_dict['data']['captcha_token']
captcha_url = res_dict['data']['src']

# Access and obtain image verification code
res = requests.get(captcha_url)

# Identify verification code
ocr = ddddocr.DdddOcr(show_ad=False)
code = ocr.classification(res.content)
print(code)

# Log in to authenticate; the token ties the answer to the issued captcha.
res = requests.post(
    url="/api/auth/authenticate",
    json={
        "mobile": "Phone number",
        "device": "pc",
        "password": "password",
        "captcha_token": captcha_token,
        "captcha": code,
        "identity": "advertiser"
    }
)

print(res.json())
# {'success': True, 'message': 'Verification is successful', 'data': {'token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.-DfjTUcuVuoCjcBqu3djvzJiTeJERaR95co'}, 'status': 200}
This concludes the article on Python crawlers with Selenium: Chinese click-captcha recognition and image verification code cases. For more Python and Selenium verification content, please search my previous articles or browse the related articles below. Thank you for your support!