正方教务系统验证码识别 · Uknow

验证码预处理

下载验证码

实现代码：

import requests


def get_checkcode(i):
    """Download one captcha image and store it as ``<i>.png`` under ``image``.

    ``i`` is the sample number; it is used for the file name and for the
    progress message (formatted with ``%d``).  The response body is written
    to disk unchanged.
    """
    # NOTE(review): the URL has no host part as published -- the real server
    # address has to be filled in before this can run.
    response = requests.get('http:///CheckCode.aspx?')
    target = 'image\\' + (str(i) + '.png')
    with open(target, 'wb') as out:
        out.write(response.content)
        print("downloading code %d.png" % i)


if __name__ == '__main__':
    # Download the captcha samples numbered 1 through 999.
    FIRST, LAST = 1, 999
    for number in range(FIRST, LAST + 1):
        get_checkcode(number)

代码分析

利用requests库里的get方法访问生成验证码页面，并保存验证码图片到本地。

代码测试：

Alt text

灰度化、分割

实现代码

import requests
from PIL import Image, ImageTk, ImageFilter


def get_checkcode(i):
    """Fetch a captcha from the server and write it to ``image`` as ``<i>.png``.

    The raw response bytes are saved as-is and a progress line is printed
    for sample number ``i``.
    """
    # NOTE(review): the host is missing from this URL in the published text.
    reply = requests.get('http:///CheckCode.aspx?')
    filename = str(i) + '.png'
    destination = 'image\\' + filename
    with open(destination, 'wb') as sink:
        sink.write(reply.content)
        print("downloading code %d.png" % i)


def process_pic(i):
    """Binarize captcha ``i`` and cut it into four character images.

    The picture is read from the ``image`` directory, thresholded (pixel
    values greater than 43 map to true) into a mode ``'1'`` image, and that
    binarized copy is saved as ``<i>.png`` in the current directory.  Four
    crops are then taken between the x positions 5, 17, 29, 41 and 53,
    covering y from 0 to 22.

    Returns:
        A list with the four cropped images, left to right.
    """
    filename = str(i) + '.png'
    picture = Image.open('image\\' + filename)
    picture = picture.point(lambda value: value > 43, mode='1')
    picture.save(filename)
    top, bottom = 0, 22
    cuts = [5, 17, 29, 41, 53]
    pieces = []
    for k in range(len(cuts) - 1):
        pieces.append(picture.crop([cuts[k], top, cuts[k + 1], bottom]))
    return pieces


if __name__ == '__main__':
    # Download captchas 1..4, split each one and save the four pieces as
    # "<captcha>_<position>.png" in the current directory.
    for number in range(1, 5):
        get_checkcode(number)
        pieces = process_pic(number)
        print("saving code %d.png cutted" % number)
        for position in range(4):
            piece_name = '_'.join([str(number), str(position)]) + '.png'
            pieces[position].save(piece_name)

代码分析

point()方法通过一个函数或者查询表对图像中的像素值进行处理。在im = im.point(lambda i: i > 43, mode='1')
中，lambda i作用于每个像素值，43是一个阈值：大于43的置为1，小于等于43的置为0；mode='1'的意思是输出为1位的二值图像，由此实现二值化。
y_min, y_max = 0, 22设定验证码图片中最大的y值和最小的y值。
zip(split_lines[:-1], split_lines[1:]) 运行结果为[(5,17),(17,29),(29,41),(41,53)]
im.crop([u, y_min, v, y_max]) ，crop()函数为用来复制一个图片里的一矩形内容，传入参数为矩形的四条边。
所以通过以上的crop()函数实现图片的分割。

代码测试

Alt text

标识

代码分析

利用Python的图形开发界面的库tkinter将验证码图片显示在一个图形化的界面，供人工标识验证码。
resize()函数将爬虫爬取的验证码放大，利于辨别。
display_pic()函数中tk.PhotoImage()方法插入图片。

实现代码

def get_pic_for_display(i):
    """Load ``<i>.png`` and return it enlarged as a Tk-compatible image.

    The picture is scaled with ``resize`` to fit a 300 x 200 box and wrapped
    in an ``ImageTk.PhotoImage``.
    """
    source = Image.open(str(i) + '.png')
    width, height = source.size
    enlarged = resize(width, height, 300, 200, source)
    return ImageTk.PhotoImage(enlarged)

def resize(w, h, w_box, h_box, pil_image):
    """Scale ``pil_image`` to fit inside a box while keeping its aspect ratio.

    Args:
        w, h: current width and height of ``pil_image`` (must be non-zero).
        w_box, h_box: width and height of the box the result has to fit in.
        pil_image: the image to scale; its ``resize`` method is used.

    Returns:
        The resized image.  The same scale factor (the smaller of
        ``w_box / w`` and ``h_box / h``) is applied to both dimensions.
    """
    factor = min(w_box / w, h_box / h)
    # Never request a zero-sized image for extreme aspect ratios.
    width = max(1, int(w * factor))
    height = max(1, int(h * factor))
    # Image.ANTIALIAS was an alias of LANCZOS and was removed in Pillow 10;
    # LANCZOS is the same filter under its current name.
    return pil_image.resize((width, height), Image.LANCZOS)

def display_pic():
    """Download the next captcha and show it in the labelling window.

    The captcha number is the stored counter plus one.  The enlarged image
    is put on ``picLabel`` and ``cntLabel`` shows how many captchas have
    been labelled so far.
    """
    # The image is kept in a module-level name; presumably so that Tk still
    # has a live reference to it while it is displayed -- TODO confirm.
    global im
    tmp = get_cnt() + 1
    get_checkcode(tmp)
    # The former tk.PhotoImage(file=...) load was dropped: its result was
    # overwritten immediately, so it only added a second decode of the file.
    im = get_pic_for_display(tmp)
    picLabel['image'] = im
    cntLabel['text'] = '总计: ' + str(tmp - 1) + '/1000'

代码测试

Alt text

归类

代码分析

code = var.get()获取图形化界面输入的验证码的值，其类型是一个字符串，其中存着四个字符。
for i in range(4):遍历四个字符，根据输入的字符值将对应的分割图片保存到sets\目录下相应字符的子目录中。其中path = os.path.join(BASE_DIR, 'sets', code[i])是将BASE_DIR, 'sets', code[i]三个参数组合成一个路径返回给path，后续还用了一个if else来判断这个目录是否存在，不存在即用makedirs创建目录。随后创建并更新该目录中count.txt里的值，这个值是用来记录当前路径下字符验证码图片的数量。ims[i].save(charname)将验证码分割后得到的图片保存到该字符的目录下。

实现代码

def save_imgs():
    """Store the four character images of the current captcha by label.

    Reads the label typed into the entry (``var``), cuts the current captcha
    with ``process_pic`` and saves piece ``k`` under ``sets/<label[k]>/`` as
    ``<n>.png``, where ``n`` is that directory's ``count.txt`` value plus
    one.  ``count.txt`` is rewritten after every saved piece, and the global
    counter is advanced with ``update_cnt`` once all four pieces are stored.

    Raises:
        IndexError: if the label has fewer than four characters.  The check
            runs before anything is written, so a short label no longer
            leaves a partially saved captcha behind.
        ValueError: if a ``count.txt`` does not contain an integer.
    """
    tmp = get_cnt() + 1
    ims = process_pic(tmp)
    code = var.get()
    if len(code) < 4:
        # Same exception type the per-character indexing used to raise, but
        # raised up front instead of half-way through the loop.
        raise IndexError('label must contain at least 4 characters')
    # The script directory does not change between iterations.
    base_dir = os.path.dirname(os.path.realpath(__file__))
    for i in range(4):
        path = os.path.join(base_dir, 'sets', code[i])
        filepath = os.path.join(path, 'count.txt')
        if os.path.exists(path):
            with open(filepath, 'r') as f:
                # int() rather than eval(): the file is data, not code.
                char_cnt = int(f.readline())
        else:
            os.makedirs(path)
            with open(filepath, 'w') as f:
                f.write('0')
            char_cnt = 0
        char_cnt += 1
        ims[i].save(os.path.join(path, str(char_cnt) + '.png'))
        with open(filepath, 'w') as f:
            f.write(str(char_cnt))
    update_cnt(tmp)

代码测试

Alt text

预处理完整代码

import tkinter as tk
import requests
from PIL import Image, ImageTk, ImageFilter
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os


def get_cnt():
    """Return the number stored in ``count.txt`` in the current directory.

    If the file is missing, unreadable, or its first line is not an integer,
    it is (re)created with the content ``0`` and 0 is returned.
    """
    try:
        with open('count.txt', 'r') as f:
            # int() instead of eval(): the file content must never be
            # executed as Python code.
            return int(f.readline())
    except (OSError, ValueError):
        # Narrow replacement for the former bare ``except``: only I/O and
        # parse failures reset the counter.
        with open('count.txt', 'w') as f:
            f.write('0')
        return 0


def update_cnt(cnt):
    """Overwrite ``count.txt`` in the current directory with ``str(cnt)``."""
    with open('count.txt', 'w+') as counter_file:
        text = str(cnt)
        counter_file.write(text)


def get_checkcode(i):
    """Download one captcha and save it as ``<i>.png`` in the current directory.

    The response body is written to the file unchanged.
    """
    # NOTE(review): the host is missing from this URL in the published text.
    response = requests.get('http:///CheckCode.aspx?')
    with open(str(i) + '.png', 'wb') as out:
        out.write(response.content)


def process_pic(i):
    """Binarize captcha ``<i>.png`` and cut it into four character images.

    Every pixel value different from 43 maps to true in the resulting mode
    ``'1'`` image.  Four crops are taken between the x positions 5, 17, 29,
    41 and 53, covering y from 0 to 22.

    Returns:
        A list with the four cropped images, left to right.
    """
    picture = Image.open(str(i) + '.png')
    picture = picture.point(lambda value: value != 43, mode='1')
    top, bottom = 0, 22
    cuts = [5, 17, 29, 41, 53]
    pieces = []
    for k in range(len(cuts) - 1):
        pieces.append(picture.crop([cuts[k], top, cuts[k + 1], bottom]))
    return pieces


def get_pic_for_display(i):
    """Load ``<i>.png`` and return it enlarged as a Tk-compatible image.

    The picture is scaled with ``resize`` to fit a 300 x 200 box and wrapped
    in an ``ImageTk.PhotoImage``.
    """
    source = Image.open(str(i) + '.png')
    width, height = source.size
    enlarged = resize(width, height, 300, 200, source)
    return ImageTk.PhotoImage(enlarged)


def resize(w, h, w_box, h_box, pil_image):
    """Scale ``pil_image`` to fit inside a box while keeping its aspect ratio.

    Args:
        w, h: current width and height of ``pil_image`` (must be non-zero).
        w_box, h_box: width and height of the box the result has to fit in.
        pil_image: the image to scale; its ``resize`` method is used.

    Returns:
        The resized image.  The same scale factor (the smaller of
        ``w_box / w`` and ``h_box / h``) is applied to both dimensions.
    """
    factor = min(w_box / w, h_box / h)
    # Never request a zero-sized image for extreme aspect ratios.
    width = max(1, int(w * factor))
    height = max(1, int(h * factor))
    # Image.ANTIALIAS was an alias of LANCZOS and was removed in Pillow 10;
    # LANCZOS is the same filter under its current name.
    return pil_image.resize((width, height), Image.LANCZOS)


def display_pic():
    """Download the next captcha and show it in the labelling window.

    The captcha number is the stored counter plus one.  The enlarged image
    is put on ``picLabel`` and ``cntLabel`` shows how many captchas have
    been labelled so far.
    """
    # The image is kept in a module-level name; presumably so that Tk still
    # has a live reference to it while it is displayed -- TODO confirm.
    global im
    tmp = get_cnt() + 1
    get_checkcode(tmp)
    # The former tk.PhotoImage(file=...) load was dropped: its result was
    # overwritten immediately, so it only added a second decode of the file.
    im = get_pic_for_display(tmp)
    picLabel['image'] = im
    cntLabel['text'] = '总计: ' + str(tmp - 1) + '/1000'


def save_imgs():
    """Store the four character images of the current captcha by label.

    Reads the label typed into the entry (``var``), cuts the current captcha
    with ``process_pic`` and saves piece ``k`` under ``sets/<label[k]>/`` as
    ``<n>.png``, where ``n`` is that directory's ``count.txt`` value plus
    one.  ``count.txt`` is rewritten after every saved piece, and the global
    counter is advanced with ``update_cnt`` once all four pieces are stored.

    Raises:
        IndexError: if the label has fewer than four characters.  The check
            runs before anything is written, so a short label no longer
            leaves a partially saved captcha behind.
        ValueError: if a ``count.txt`` does not contain an integer.
    """
    tmp = get_cnt() + 1
    ims = process_pic(tmp)
    code = var.get()
    if len(code) < 4:
        # Same exception type the per-character indexing used to raise, but
        # raised up front instead of half-way through the loop.
        raise IndexError('label must contain at least 4 characters')
    # The script directory does not change between iterations.
    base_dir = os.path.dirname(os.path.realpath(__file__))
    for i in range(4):
        path = os.path.join(base_dir, 'sets', code[i])
        filepath = os.path.join(path, 'count.txt')
        if os.path.exists(path):
            with open(filepath, 'r') as f:
                # int() rather than eval(): the file is data, not code.
                char_cnt = int(f.readline())
        else:
            os.makedirs(path)
            with open(filepath, 'w') as f:
                f.write('0')
            char_cnt = 0
        char_cnt += 1
        ims[i].save(os.path.join(path, str(char_cnt) + '.png'))
        with open(filepath, 'w') as f:
            f.write(str(char_cnt))
    update_cnt(tmp)


def submit():
    """Handle a click on the submit button.

    Saves the pieces of the current captcha under the typed label, loads and
    shows the next captcha, then clears the text entry.
    """
    save_imgs()
    display_pic()
    var.set('')


def init():
    """Show the first captcha when the window starts up."""
    display_pic()


# ``global`` at module level has no effect; ``im`` is assigned in display_pic().
global im
# Main window of the labelling tool.
app = tk.Tk()
app.title('Labeller')
app.geometry('500x260')
# Label that displays the enlarged captcha image.
picLabel = tk.Label(app)
picLabel.pack()
# Text entry bound to ``var``; save_imgs() reads the typed label from it.
var = tk.StringVar()
textInput = tk.Entry(app, textvariable=var)
textInput.pack(expand='yes', fill='both', padx=100, side='top', pady=10)
# Submit button: stores the labelled pieces and loads the next captcha.
submitButton = tk.Button(app, text="提交", width='10', command=submit)
submitButton.pack()
# Label that shows the running total of labelled captchas.
cntLabel = tk.Label(app)
cntLabel.pack(pady=20)
# Display the first captcha, then hand control to the Tk event loop.
init()
app.mainloop()

Alt text

验证码训练

加载、处理图片数据

def process(data):
    """Binarize pixel data in place: every value greater than 0 becomes 1.

    Args:
        data: a NumPy array of pixel values (the callers pass shape
            ``(1, n)``), or a nested list of rows.

    Returns:
        The same ``data`` object, after modification.

    The previous loop ran over ``len(data)``, which is the number of rows
    (1 for a ``(1, n)`` array), so only the first pixel was ever touched.
    NOTE: features produced by this version differ from the old ones, so the
    model should be retrained and training and recognition must use the same
    version of this function.
    """
    if isinstance(data, np.ndarray):
        # Boolean-mask assignment handles every element of any shape.
        data[data > 0] = 1
        return data
    for row in data:
        for k, value in enumerate(row):
            if value > 0:
                row[k] = 1
    return data


def load_image(path):
    """Read the image at ``path`` into a one-row feature array.

    The pixel sequence is reshaped to ``(1, -1)`` and then passed through
    ``process``; the result of ``process`` is returned.
    """
    pixels = Image.open(path).getdata()
    row = np.array(pixels).reshape(1, -1)
    return process(row)

process()函数是实现一个二值化的过程。
data = np.array(im.getdata()).reshape(1, -1)中的im.getdata()以包含像素值的sequence对象形式返回图像的内容；对于RGB图像，该对象的每一个元素对应一个像素点的R、G和B三个值，而这里保存的是二值图像，每个元素只有一个值。reshape(1, -1)再把它整理成只有一行的特征向量。

def get_count(path, char):
    """Return the sample count stored in ``<path>/<char>/count.txt``.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the first line is not an integer.
    """
    filepath = os.path.join(path, char, 'count.txt')
    with open(filepath, 'r') as f:
        # int() instead of eval(): the file content must never be executed.
        return int(f.readline())


def get_label(char):
    """Return the position of ``char`` in the module-level ``char_list``.

    The first matching index is returned; ``None`` is returned when ``char``
    does not occur in the list.
    """
    matches = (idx for idx, known in enumerate(char_list) if known == char)
    return next(matches, None)


def build_char_set(path, char):
    """Build the labelled sample matrix for one character.

    Loads ``<path>/<char>/1.png`` up to ``<count>.png`` (``count`` comes from
    ``get_count``) as rows of 264 features and appends the label from
    ``get_label(char)`` as an extra last column.

    Returns:
        An array of shape ``(count, 265)``.
    """
    total = get_count(path, char)
    features = np.zeros((total, 264))
    labels = np.zeros((total, 1))
    for row, number in enumerate(range(1, total + 1)):
        sample_path = os.path.join(path, char, str(number) + '.png')
        features[row, :] = load_image(sample_path)
        labels[row] = get_label(char)
    return np.hstack((features, labels))
	
def build_sets(path):
    """Stack the sample matrices of every character in ``char_list``.

    The matrices from ``build_char_set`` are stacked vertically in the order
    of the module-level ``char_list``.
    """
    stacked = build_char_set(path, char_list[0])
    for position in range(1, len(char_list)):
        next_block = build_char_set(path, char_list[position])
        stacked = np.vstack((stacked, next_block))
    return stacked

cnt = get_count(path, char)通过get_count()函数读取相关字符目录下count.txt文件的数值。
x = np.zeros((cnt, 264)) y = np.zeros((cnt, 1))返回来一个给定形状和类型的用0填充的数组，用np.zeros生成(cnt,264)和(cnt,1)的数组下x,y。
get_label()函数返回字符char在char_list中的下标，作为该字符的类别标签。load_image()函数用来读取文件目录下的字符图片。for循环分别填充刚刚生成的x,y数组。
np.hstack((x, y))将填充的y,x数组将水平(按列顺序)把数组给堆叠起来。返回给char_set。
for i in range(1, len(char_list)):遍历所有字符，得到每个字符水平(按列顺序)堆叠后的数组，再用sets = np.vstack((sets, char_set))垂直（按照行顺序）地把这些数组堆叠起来赋值给sets。

配置SVM模型

def build_training_sets(sets, percent):
    """Return the leading rows of ``sets`` to use for training.

    ``percent`` is a fraction; the first ``int(len(sets) * percent)`` rows
    are returned with all columns.
    """
    cutoff = int(len(sets) * percent)
    return sets[:cutoff, :]


def build_test_sets(sets, percent):
    """Return the trailing rows of ``sets`` to use for testing.

    ``percent`` is the fraction reserved for testing: rows from index
    ``int(len(sets) * (1 - percent))`` to the end are returned.
    """
    start = int(len(sets) * (1 - percent))
    return sets[start:, :]

声明两个函数分别是build_training_sets(sets, percent)和build_test_sets(sets, percent)。这两个函数参数为sets和percent即上部分析得到的sets值和一个百分比值,用于训练集和测试集的建立。

加载测试集、训练集

def train(training_sets):
    """Fit a linear SVM on the training matrix and return the classifier.

    Args:
        training_sets: 2-D array whose first 264 columns are the features
            and whose column 264 holds the class label.

    Returns:
        The fitted ``svm.LinearSVC`` instance.
    """
    x = training_sets[:, 0:264]
    # Labels are passed as a 1-D vector: a column vector makes scikit-learn
    # emit a DataConversionWarning and flatten it anyway.
    y = training_sets[:, 264]
    clf = svm.LinearSVC()
    clf.fit(x, y)
    return clf


def recognize(y):
    """Return the entry of the module-level ``char_list`` at index ``y``."""
    global char_list
    return char_list[y]


def predict(clf, x):
    """Return the character ``clf`` predicts for the sample in ``x``.

    Only the first element of ``clf.predict(x)`` is used; it is converted to
    ``int`` and mapped to a character with ``recognize``.
    """
    return recognize(int(clf.predict(x)[0]))


def accuracy(pred, real):
    """Return the fraction of positions where ``pred`` and ``real`` agree.

    Positions are taken from ``pred``; the result is the number of matches
    divided by ``len(pred)``.
    """
    hits = sum(1 for idx in range(len(pred)) if pred[idx] == real[idx])
    return hits / len(pred)


def test(clf, test_sets):
    """Return the accuracy of ``clf`` on the labelled test matrix.

    Args:
        clf: classifier handed to ``predict``.
        test_sets: 2-D array whose first 264 columns are the features and
            whose column 264 holds the class label.

    Returns:
        The value of ``accuracy`` for the predicted and the real characters.
    """
    x = test_sets[:, 0:264]
    # 1-D labels, so int() receives a scalar and not a one-element array.
    y = test_sets[:, 264]
    # Iterate over samples (rows).  The previous code used x.shape[1], the
    # number of features, which read the wrong number of rows.
    pred = [predict(clf, x[i, :].reshape(1, -1)) for i in range(x.shape[0])]
    real = [recognize(int(label)) for label in y]
    return accuracy(pred, real)

train(training_sets)函数中x和y是用来生成训练集数据的，在后续函数可以得知。
clf = svm.LinearSVC()初始化一个SVM模型。clf.fit(x, y)用x和y作为训练数据拟合模型。
predict(clf, x)用训练好的模型对单个字符的特征向量x进行预测，并通过recognize()把预测得到的类别下标转换成对应的字符。
test(clf, test_sets)函数是用来进行测试集数据处理的函数，函数中加载测试集的sets数据，并计算测试集精度由accuracy()函数返回。

训练、测试并更新

def update_model():
    """Retrain the SVM on the labelled samples and save it to disk.

    Samples are loaded from the ``sets`` directory next to this script,
    shuffled, and split: the first 90% of the rows train the model and the
    last 10% test it.  The model is written to ``model/svm.model`` next to
    this script and the test accuracy is printed.
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(base_dir, 'sets')
    model_dir = os.path.join(base_dir, 'model')
    model_path = os.path.join(model_dir, 'svm.model')
    sets = build_sets(data_path)
    # np.random.shuffle reorders the rows in place on every call.
    for _ in range(5):
        np.random.shuffle(sets)
    training_sets = build_training_sets(sets, 0.9)
    test_sets = build_test_sets(sets, 0.1)
    model = train(training_sets)
    res = test(model, test_sets)
    # Make sure the target directory exists so the trained model is not lost
    # to a missing-directory error when it is dumped.
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, model_path)
    print('Model updated! The accuracy on test sets: ' + str(res))

从update_model()函数分析，这个函数是用来更新模型的，先是加载相应的图像数据，再训练并保存SVM模型。
for i in range(5): np.random.shuffle(sets)利用一个for循环调用5次np.random.shuffle(sets)，原地打乱sets中各行（样本）的顺序。
利用training_sets = build_training_sets(sets, 0.9)和test_sets = build_test_sets(sets, 0.1),前面提到的两个函数创建训练集和测试集。
model = train(training_sets)加载训练集，训练模型。res = test(model, test_sets)返回测试集的精度。
joblib.dump(model, model_path)使用joblib把训练好的SVM模型保存到相应目录。

识别验证码

加载、处理验证码

def process_pic(path):
    """Binarize the captcha at ``path`` and cut it into four character images.

    Every pixel value different from 43 maps to true in the resulting mode
    ``'1'`` image.  Four crops are taken between the x positions 5, 17, 29,
    41 and 53, covering y from 0 to 22.

    Returns:
        A list with the four cropped images, left to right.
    """
    binary = Image.open(path).point(lambda value: value != 43, mode='1')
    upper, lower = 0, 22
    edges = [5, 17, 29, 41, 53]
    return [binary.crop([left, upper, right, lower])
            for left, right in zip(edges, edges[1:])]


def process(data):
    """Binarize pixel data in place: every value greater than 0 becomes 1.

    Args:
        data: a NumPy array of pixel values (the callers pass shape
            ``(1, n)``), or a nested list of rows.

    Returns:
        The same ``data`` object, after modification.

    The previous loop ran over ``len(data)``, which is the number of rows
    (1 for a ``(1, n)`` array), so only the first pixel was ever touched.
    NOTE: features produced by this version differ from the old ones, so the
    model should be retrained and training and recognition must use the same
    version of this function.
    """
    if isinstance(data, np.ndarray):
        # Boolean-mask assignment handles every element of any shape.
        data[data > 0] = 1
        return data
    for row in data:
        for k, value in enumerate(row):
            if value > 0:
                row[k] = 1
    return data

process_pic()函数和process()函数在前面的标注和训练部分是出现过的，第一个函数是用来分割验证码的，把验证码分割成四部分。第二个函数是用来进行二值化的。

识别验证码

def recognize_checkcode(path):
    """Recognise the four characters of the captcha image at ``path``.

    The model is loaded from ``./model/svm.model`` (relative to the current
    directory), the captcha is split with ``process_pic``, and each piece is
    turned into a one-row feature array, binarized with ``process`` and
    classified.

    Returns:
        The four predicted characters joined into one string.
    """
    model_path = r'./model/svm.model'
    model = joblib.load(model_path)
    char_list = list("0123456789abcdefghijklmnopqrstuvwxy")
    ims = process_pic(path)
    code = []
    for j in range(4):
        data = np.array(ims[j].getdata()).reshape(1, -1)
        data = process(data)
        # Map the predicted class index through the local alphabet.  The
        # former predict()/recognize() path read a module-level char_list
        # and left this local list unused.
        label = int(model.predict(data)[0])
        code.append(char_list[label])
    return ''.join(code)

recognize_checkcode(path)函数中，首先通过joblib的load方法，加载保存的模型。
process_pic(path)分隔图片，for j in range(4)循环把分隔的四张图像进行二值化过程。
predict(model, data)，把四张图像加入SVM模型测试。将结果利用数组的append方法添加到数组。