Python crawler with Selenium: solving Chinese text-click CAPTCHAs and image verification codes, with worked examples (latest recommendation)

1. Capture the images

import re
import time
import ddddocr
import requests
from selenium import webdriver
from  import By
from  import Service
from  import WebDriverWait
from  import ActionChains
service = Service("driver/")
driver = (service=service)
# 1. Open the home page('/adaptive-captcha-demo')
# 2. Click [Text click Verification]tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    ,
    '//*[@]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
))
()
# 3. Click to start verificationtag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.CLASS_NAME,
    'geetest_btn_click'
))
()
(5)
# The target image to be identifiedtarget_tag = driver.find_element(
    By.CLASS_NAME,
    'geetest_ques_back'
)
target_tag.screenshot("")
# Identify picturesbg_tag = driver.find_element(
    By.CLASS_NAME,
    'geetest_bg'
)
bg_tag.screenshot("")
(2000)
()

2. Target recognition

Take a screenshot of each target character and recognize it with ddddocr.

import re
import time
import ddddocr
import requests
from selenium import webdriver
from  import By
from  import Service
from  import WebDriverWait
from  import ActionChains
service = Service("driver/")
driver = (service=service)
# 1. Open the home page('/adaptive-captcha-demo')
# 2. Click [Sliding puzzle verification]tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    ,
    '//*[@]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
))
()
# 3. Click to start verificationtag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.CLASS_NAME,
    'geetest_btn_click'
))
()
# 4. Wait for the verification code to come out(5)
# 5. Identify task picturestarget_word_list = []
parent = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
tag_list = parent.find_elements(By.TAG_NAME, "img")
for tag in tag_list:
    ocr = (show_ad=False)
    word = (tag.screenshot_as_png)
    target_word_list.append(word)
print("Text to be identified:", target_word_list)
(2000)
()

3. Background coordinate recognition

3.1 ddddocr

ddddocr can recognize the characters, but its default recognition rate is somewhat low. To improve the recognition rate, you can set up a PyTorch environment and train your own model; see: https://github.com/sml2h3/dddd_trainer

import re
import time
import ddddocr
import requests
from selenium import webdriver
from  import By
from  import Service
from  import WebDriverWait
from  import ActionChains
from PIL import Image, ImageDraw
from io import BytesIO
service = Service("driver/")
driver = (service=service)
# 1. Open the home page('/adaptive-captcha-demo')
# 2. Click [Sliding puzzle verification]tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    ,
    '//*[@]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
))
()
# 3. Click to start verificationtag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.CLASS_NAME,
    'geetest_btn_click'
))
()
# 4. Wait for the verification code to come out(5)
# 5. Identify task picturestarget_word_list = []
parent = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
tag_list = parent.find_elements(By.TAG_NAME, "img")
for tag in tag_list:
    ocr = (show_ad=False)
    word = (tag.screenshot_as_png)
    target_word_list.append(word)
print("Text to be identified:", target_word_list)
# 6. Background picturebg_tag = driver.find_element(
    By.CLASS_NAME,
    'geetest_bg'
)
content = bg_tag.screenshot_as_png
# 7. Identify all text in the background and get coordinatesocr = (show_ad=False, det=True)
poses = (content) # [(x1, y1, x2, y2), (x1, y1, x2, y2), x1, y1, x2, y2]
# 8. Loop each text in the coordinates and identify itbg_word_dict = {}
img = (BytesIO(content))
for box in poses:
    x1, y1, x2, y2 = box
    # Get the picture of each text according to the coordinates    corp = (box)
    img_byte = BytesIO()
    (img_byte, 'png')
    # Identify text    ocr2 = (show_ad=False)
    word = (img_byte.getvalue())  # Low recognition rate    # Get the coordinates of each word {"Duck":}    bg_word_dict[word] = [int((x1 + x2) / 2), int((y1 + y2) / 2)]
print(bg_word_dict)
(1000)
()

3.2 CAPTCHA-solving platform

import base64
import requests
from hashlib import md5
file_bytes = open('', 'rb').read()
res = (
    url='/Upload/',
    data={
        'user': "deng",
        'pass2': md5("password".encode('utf-8')).hexdigest(),
        'codetype': "9501",
        'file_base64': base64.b64encode(file_bytes)
    },
    headers={
        'Connection': 'Keep-Alive',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
    }
)
res_dict = ()
print(res_dict)
# {'err_no': 0, 'err_str': 'OK', 'pic_id': '1234612060701120002', 'pic_str': ',86,73|Flour,111,38|Dish,40,49|Fragrant,198,101', 'md5': 'faac71fc832b2ead01ffb4e813f3be60'}

Combining the Geetest screenshot step with recognition through the platform:

import re
import time
import ddddocr
import requests
import base64
import requests
from hashlib import md5
from selenium import webdriver
from  import By
from  import Service
from  import WebDriverWait
from  import ActionChains
from PIL import Image, ImageDraw
from io import BytesIO
service = Service("driver/")
driver = (service=service)
# 1. Open the home page('/adaptive-captcha-demo')
# 2. Click [Sliding puzzle verification]tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    ,
    '//*[@]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
))
()
# 3. Click to start verificationtag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.CLASS_NAME,
    'geetest_btn_click'
))
()
# 4. Wait for the verification code to come out(5)
# 5. Identify task picturestarget_word_list = []
parent = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
tag_list = parent.find_elements(By.TAG_NAME, "img")
for tag in tag_list:
    ocr = (show_ad=False)
    word = (tag.screenshot_as_png)
    target_word_list.append(word)
print("Text to be identified:", target_word_list)
# 6. Background picturebg_tag = driver.find_element(
    By.CLASS_NAME,
    'geetest_bg'
)
content = bg_tag.screenshot_as_png
bg_tag.screenshot("")
# 7. Identify all text in the background and get coordinatesres = (
    url='/Upload/',
    data={
        'user': "deng",
        'pass2': md5("password".encode('utf-8')).hexdigest(),
        'codetype': "9501",
        'file_base64': base64.b64encode(content)
    },
    headers={
        'Connection': 'Keep-Alive',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
    }
)
res_dict = ()
print(res_dict)
# 8. Coordinates of each word {"Duck":(196,85), ...} target_word_list = ["Flower","Duck","word"]bg_word_dict = {}
for item in res_dict["pic_str"].split("|"):
    word, x, y = (",")
    bg_word_dict[word] = (x, y)
print(bg_word_dict)
(1000)
()

4. Coordinate click

Click each target character in the verification image at its recognized coordinates.

ActionChains(driver).move_to_element_with_offset(Tag Object, xoffset=x, yoffset=y).click().perform()

import re
import time
import ddddocr
import requests
import base64
import requests
from hashlib import md5
from selenium import webdriver
from  import By
from  import Service
from  import WebDriverWait
from  import ActionChains
from PIL import Image, ImageDraw
from io import BytesIO
service = Service("driver/")
driver = (service=service)
# 1. Open the home page('/adaptive-captcha-demo')
# 2. Click [Sliding puzzle verification]tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    ,
    '//*[@]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
))
()
# 3. Click to start verificationtag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.CLASS_NAME,
    'geetest_btn_click'
))
()
# 4. Wait for the verification code to come out(5)
# 5. Identify task picturestarget_word_list = []
parent = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
tag_list = parent.find_elements(By.TAG_NAME, "img")
for tag in tag_list:
    ocr = (show_ad=False)
    word = (tag.screenshot_as_png)
    target_word_list.append(word)
print("Text to be identified:", target_word_list)
# 6. Background picturebg_tag = driver.find_element(
    By.CLASS_NAME,
    'geetest_bg'
)
content = bg_tag.screenshot_as_png
# bg_tag.screenshot("")
# 7. Identify all text in the background and get coordinatesres = (
    url='/Upload/',
    data={
        'user': "deng",
        'pass2': md5("Own password".encode('utf-8')).hexdigest(),
        'codetype': "9501",
        'file_base64': base64.b64encode(content)
    },
    headers={
        'Connection': 'Keep-Alive',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
    }
)
res_dict = ()
bg_word_dict = {}
for item in res_dict["pic_str"].split("|"):
    word, x, y = (",")
    bg_word_dict[word] = (x, y)
print(bg_word_dict)
# target_word_list = ['Pink', 'Dish', 'Scent']# bg_word_dict = {'Pink': ('10', '10'), 'Dish': ('50', '50'), 'Scent': ('100', '93')}# 8. Clickfor word in target_word_list:
    (2)
    group = bg_word_dict.get(word)
    if not group:
        continue
    x, y = group
    x = int(x) - int(bg_tag.size['width'] / 2)
    y = int(y) - int(bg_tag.size['height'] / 2)
    ActionChains(driver).move_to_element_with_offset(bg_tag, xoffset=x, yoffset=y).click().perform()
(1000)
()

5. Image verification code

Many sites add an image verification code to login, registration, and other frequently repeated operations.

To automate such features in code, you must first recognize the verification code automatically, and only then carry out the remaining steps.

6. Identification

The Python module ddddocr can recognize image verification codes.

pip3.11 install ddddocr==1.4.9  -i /pypi/simple/
pip3.11 install Pillow==9.5.0

pip install ddddocr==1.4.9  -i /pypi/simple/
pip install Pillow==9.5.0

6.1 Local Identification

import ddddocr
ocr = (show_ad=False)
with open("img/", mode='rb') as f:
    body = ()
code = (body)
print(code)

6.2 Online identification

You can also download the image with an HTTP request and recognize it directly:

import ddddocr
import requests
res = (url="/captcha/create/reg?_t=1701511836608")
ocr = (show_ad=False)
code = ()
print(code)

import ddddocr
import requests
res = (
    url=f"/api/auth/captcha?captcha_token=n5A6VXIsMiI4MTKoco0VigkZbByJbDahhRHGNJmS"
)
ocr = (show_ad=False)
code = ()
print(code)

6.3 base64

Some platforms deliver the image as a base64-encoded string, which must be decoded before it can be recognized.

import base64
import ddddocr
content = base64.b64decode("iVBORw0KGgoAAAANSUhEUgAAAGQAAAAoCAYAAAAIeF9DAAAHGElEQVR4Xu2a2VNTZxTAHZ/62of+BX3rdPrUmaq1da3WQWur1mqntrQWLe7UkUoQlEWFqFDZZN8hUBWKQUVpQDCyVUeltVWIIiAEZHWBAEk4zffZe+bmS+6SEEzE/GbOkHvPuXeY85t7vyWZBV48ilnsCS/uxSvEw3hthJydXWITnsiMFyLWfLGcu5jRQuQ2W27dy8ArBOTXvQxmrBBHm+xo/XQhKkTffRu0NSfgt8KvISttGaQlfQyFOWtBXboDbt7Ig6HBdvYSj6C7awDC3kqGg4oC8N0YC6uWhcPCOUHgszQMtvudgsK8Gnj2dNTqGmeErH47Z8rBYlfI6MgAXLqwH5Lj50iGJzL//UDJWP1pBDTfasNrnBEyHdgIef6sF1R5X9o0Xig8Ebb5QrF8QajlLTDoMTIIVkLMZiOcKfrWquGXLyqgo70BDKNDNG8wDFleCTehsS4JivK/4l/uMWxco4T4WDUo30yH+zo9PH0yavn/x+nnuBg1LPhgP0qJjjzjuUJuNGWiiJSEedByt4KffiWxt9bIz65GISveO2CVczcoxGQah+y05SikqT6ZX/fKw4khkfNGEQpZNDeILXUrKKT13mWUkZmyBIxGA7/O5XT2PoL0c9kQcPIX+D5yK2w/vgcis6Oh5qYWJicnac2msM0YrmRw4BkK+cwyA3OEuLh4CApS0GhoaGTTNtTXN2B9fHwCm7YBhdRWR6GQK5rD/BqXU9FQCb4RflYN50d0fgwYxsemTUjJ6ToUEh6iYtOi1NXVY4MTEhLZtA2khquvr69n0zagEP5gfu/uBXruga4aykr8ISv1E0g/tRBUueug+o8I6NH/hTdwFO3tOhsB9iK5NN2lQsbHjZbJSR9kplbC4nkKKsNnySHoejTAlooyOjoKisAXDSYRN2cvW4Lo9XqsCwkJpddKgUJyM3xQSH9fC9RUHbWabbFBnigy63KEEcMIbI3agU0mrynN9WoYfDoIJrOJ/iXH5DwryFnYqS4XP26Kg86OPrZcklPvBIBKVYSNVqvL6Tl7qNVqrCsqKmbTdkEhaUkfYbOv1cbaCLAXNVVR/HtJQl5VXIO3Ru+E3sHHbAmFnCf56RKyb3cmPLjfw5bKgjS/tVWHjY6IiISkd22FmEwmmuPqdDodW2IXFJKaOB8bnZr4IRQXbITWlst01U6ehJGRfnpM1h58KY68vo4VxGKD1doXr0UhSH66hHCx/+dsePJkhL1EkiSLlJDdQdjs5mbbHjQ3N2NeqTyGExUpUAgZJ7gmny32BeOE/ffdhOX8adUmrK2qlD9L2RWzFxvc3adn01Z09XW7RAiH0WiCx73DoKm8DVt8E1DK+tVRTknRaDTY8MzMLDZNz3F5jaaKTQuCQgpy1mCTH3U08Wts6LSs3Lnawty1bFoQMr3lGjxhnGDTVpC8K4XwMZvNELwvF6XEnTjHlkgyPDwMCkUwbTj5S47l5KRAIefL9mCThZ4ODpLnasnsSy6eIoTQ/vAxClm36iibloXQUyD19IiBQhquJTgnJHkRmxbEna8slgnLNJgT4uxq3d44QYJ8FhtfxEAhXZ3XscmdHeIrULLZyNWq8tazaUHcMagL0XKvC4WQ70ucwXYmdZ/OprhjkiM1joBCJifNkJe5kjaZDuoCWyfsoO7I1Ncd0157jI1NwC7/FBQSGJDFlsiGv9YoLi6m6w3uuLy8nC2XxGq395+/S7HRZNqra6m0rC4HLYOgif4lx8X5G7AmOX4uDPTLm18T7C0Mq65fsSwIh/5fGA7R46ksDDd8oYSTx89BnfZfut1O9q1MJjPdfm970EO3Tcj2PH/6e7XmDnsb2bCrcRLcsV7v+FrHSgh5SviDu1RUrAyB1tlX+beQRO7WSUppBn7+LtyPvY0g/EbLicOH5K2gxeDvV3GRmJjElsnC
SgiBDNgV5wNtms8P8l3Jn41pluoXix0ixRExcjYXh58/weOflLvYWwjCNlwoyECennzJ8vTLW7CJQXZ9WSGNjeLjsBA2Qjja27RQeTEYcsJW0G2VjJTF9McO2toYwR83OCKFbL+nlWVBwK+BdDq87Zj19jvJc0ICE4PZywUhP3BQ/94E4QdU8MM3J+HzFZH0Bw5L5wfDGp8jsHdnBuRlVdNFoqswGAwQGnoQZZDPY2NjbJksBIUQHGkwhzPX2KPkShkKSTqbwqZnLIJCptLYqVxLaNd3gN/RbSik9paWLZmxuEVIcMohKL92EVo6dNA/PABGk5F+IdXW/RBOV5XA5iP+KMNfuRvGJ8bZW8xY3CKEHcTFouGO+L7aTMNjhWw+7P9avao4BIUQpBprDznXtPd0wJmqUjiSo4R98Qq6WPSN2EIXhBFZUXRAJ9Pe1xFRIQQ5DeZwpNaLfSSFEEijxZotlfciH1lCOLjGs+HFdTgkxMv08x9BPe61Ol73uQAAAABJRU5ErkJggg==")
# with open('', mode='wb') as f:
#     (content)
ocr = (show_ad=False)
code = (content)
print(code)

7. Case: xwenjie

import requests
import ddddocr
# Obtain the image verification code addressres = (url="/api/auth/captcha/generate")
res_dict = ()
captcha_token = res_dict['data']['captcha_token']
captcha_url = res_dict['data']['src']
# Access and obtain image verification coderes = (captcha_url)
# Identify verification codeocr = (show_ad=False)
code = ()
print(code)
# Log in to authenticateres = (
    url="/api/auth/authenticate",
    json={
        "mobile": "Phone number",
        "device": "pc",
        "password": "password",
        "captcha_token": captcha_token,
        "captcha": code,
        "identity": "advertiser"
    }
)
print(())
# {'success': True, 'message': 'Verification is successful', 'data': {'token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.-DfjTUcuVuoCjcBqu3djvzJiTeJERaR95co'}, 'status': 200}

This concludes the article on Python crawler Selenium verification: Chinese text-click recognition and image verification codes. For more on Python and Selenium verification, please search for my earlier articles or browse the related articles below. Thank you for your support!