验证码预处理

下载验证码

实现代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
import requests


def get_checkcode(i):
    """Download one captcha image and store it as ``<i>.png`` under ``image``.

    Args:
        i: Integer sequence number used to build the file name.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the target file cannot be opened for writing.
    """
    # NOTE(review): the host part of this URL is empty in the source text;
    # fill in the real site before running.
    # The timeout keeps a stalled server from blocking forever, and
    # raise_for_status() stops an HTTP error page from being written to disk
    # as if it were an image.
    r = requests.get('http:///CheckCode.aspx?', timeout=10)
    r.raise_for_status()
    picname = str(i) + '.png'
    # The path is built with a literal backslash, so it only addresses an
    # ``image`` sub-directory on systems that use that separator.
    with open('image\\' + picname, 'wb') as f:
        f.write(r.content)
    print("downloading code %d.png" % i)


if __name__ == '__main__':
    # Fetch captcha samples numbered 1 through 999, one request each.
    for index in range(1, 1000):
        get_checkcode(index)

代码分析

  • 利用requests库里的get方法访问生成验证码页面,并保存验证码图片到本地。

代码测试:

Alt text

灰度化、分割

实现代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import requests
from PIL import Image, ImageTk, ImageFilter


def get_checkcode(i):
    """Download one captcha image and store it as ``<i>.png`` under ``image``.

    Args:
        i: Integer sequence number used to build the file name.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the target file cannot be opened for writing.
    """
    # NOTE(review): the host part of this URL is empty in the source text;
    # fill in the real site before running.
    # The timeout keeps a stalled server from blocking forever, and
    # raise_for_status() stops an HTTP error page from being written to disk
    # as if it were an image.
    r = requests.get('http:///CheckCode.aspx?', timeout=10)
    r.raise_for_status()
    picname = str(i) + '.png'
    # The path is built with a literal backslash, so it only addresses an
    # ``image`` sub-directory on systems that use that separator.
    with open('image\\' + picname, 'wb') as f:
        f.write(r.content)
    print("downloading code %d.png" % i)


def process_pic(i):
    """Binarize captcha ``i`` and cut it into four character images.

    The image is read from ``'image\\' + '<i>.png'``. Every pixel value
    greater than 43 maps to a truthy value and everything else to a falsy
    one, with output mode ``'1'``. The binarized picture is also saved under
    the bare file name ``<i>.png``.

    Args:
        i: Sequence number of the captcha to process.

    Returns:
        A list of four crops covering the columns 5-17, 17-29, 29-41 and
        41-53, each spanning rows 0 to 22.
    """
    picname = str(i) + '.png'
    source = Image.open('image\\' + picname)
    binary = source.point(lambda value: value > 43, mode='1')
    binary.save(picname)

    top, bottom = 0, 22
    split_lines = [5, 17, 29, 41, 53]
    pieces = []
    # Neighbouring split lines are the left and right edge of one character.
    for left, right in zip(split_lines[:-1], split_lines[1:]):
        pieces.append(binary.crop([left, top, right, bottom]))
    return pieces


if __name__ == '__main__':
    # Download captchas 1-4, split each one and store the four pieces as
    # <captcha>_<position>.png.
    for index in range(1, 5):
        get_checkcode(index)
        pieces = process_pic(index)
        print("saving code %d.png cutted" % index)
        for position in range(0, 4):
            piece_name = str(index) + '_' + str(position) + '.png'
            pieces[position].save(piece_name)

代码分析

  • point()方法通过一个函数或者查询表对图像中的每个像素值进行映射。在 im = im.point(lambda i: i > 43, mode='1') 中,
    lambda 会作用于每个像素值,43 是阈值:大于 43 的映射为 1,否则映射为 0;mode='1' 表示输出为 1 位的二值图像,由此实现二值化(即本节标题所说的“灰度化”)。

  • y_min, y_max = 0, 22设定验证码图片中最大的y值和最小的y值。

  • zip(split_lines[:-1], split_lines[1:]) 运行结果为[(5,17),(17,29),(29,41),(41,53)]

  • im.crop([u, y_min, v, y_max]):crop()函数用来复制图片中的一块矩形区域,传入的参数依次为矩形左、上、右、下四条边的坐标。

  • 所以通过以上的crop()函数实现图片的分割。

代码测试

Alt text

标识

代码分析

  • 利用Python的图形开发界面的库tkinter将验证码图片显示在一个图形化的界面,供人工标识验证码。
  • resize()函数将爬虫爬取的验证码放大,利于辨别。
  • display_pic()函数中tk.PhotoImage()方法插入图片。

实现代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def get_pic_for_display(i):
    """Load ``<i>.png`` and return it enlarged as a Tk-compatible image.

    Args:
        i: Sequence number of the captcha file to show.

    Returns:
        An ``ImageTk.PhotoImage`` built from the picture after it has been
        scaled by ``resize`` to fit a 300 x 200 box.
    """
    source = Image.open(str(i) + '.png')
    width, height = source.size
    # Bounding box, in pixels, that the preview has to fit into.
    box_width, box_height = 300, 200
    enlarged = resize(width, height, box_width, box_height, source)
    return ImageTk.PhotoImage(enlarged)

def resize(w, h, w_box, h_box, pil_image):
    """Scale ``pil_image`` to fit a ``w_box`` x ``h_box`` box, keeping its aspect ratio.

    Args:
        w: Current width of the image in pixels.
        h: Current height of the image in pixels.
        w_box: Width of the bounding box.
        h_box: Height of the bounding box.
        pil_image: Image object whose ``resize`` method is called.

    Returns:
        The resized image returned by ``pil_image.resize``.

    Raises:
        ZeroDivisionError: If ``w`` or ``h`` is 0.
    """
    # The smaller of the two ratios is the largest factor that still keeps
    # both dimensions inside the box.
    factor = min(w_box / w, h_box / h)
    # Truncation could yield 0 for extreme aspect ratios; keep at least 1 px.
    width = max(1, int(w * factor))
    height = max(1, int(h * factor))
    # Image.ANTIALIAS was an alias of LANCZOS and no longer exists in
    # Pillow 10+, so the filter is referenced by its current name.
    return pil_image.resize((width, height), Image.LANCZOS)

def display_pic():
    """Download the next captcha and show it together with the progress text.

    The next sequence number is ``get_cnt() + 1``. The picture is fetched with
    ``get_checkcode``, converted by ``get_pic_for_display`` and assigned to
    ``picLabel``; ``cntLabel`` is updated with the current count.
    """
    # The image object is kept in a module-level name, presumably so that it
    # stays referenced while the label displays it.
    global im
    tmp = get_cnt() + 1
    get_checkcode(tmp)
    # The picture is loaded once, through PIL. The former extra
    # tk.PhotoImage(file=...) load was overwritten immediately and could
    # raise for formats Tk cannot decode on its own.
    im = get_pic_for_display(tmp)
    picLabel['image'] = im
    cntLabel['text'] = '总计: ' + str(tmp - 1) + '/1000'

代码测试

Alt text

归类

代码分析

  • code = var.get()获取图形化界面输入的验证码的值,其类型是字符串,其中包含四个字符。
  • for i in range(4):遍历四个字符,根据输入的字符值把对应的图片保存到sets\目录下与该字符同名的子目录中。其中path = os.path.join(BASE_DIR, 'sets', code[i])是将BASE_DIR、'sets'、code[i]三个参数组合成一个路径并赋给path;随后用if else判断这个目录是否存在,不存在则用makedirs创建目录。同时创建并更新该目录中的count.txt,其中的值用来记录当前路径下字符验证码图片的数量。ims[i].save(charname)将验证码分割后得到的图片保存到该字符对应的目录下。

实现代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
def save_imgs():
    """Store the pieces of the current captcha under their typed labels.

    The captcha with number ``get_cnt() + 1`` is split by ``process_pic``.
    Piece ``k`` is saved as ``sets/<k-th typed character>/<n>.png`` next to
    this script, where ``n`` is that directory's counter from ``count.txt``
    plus one. The counter file is then rewritten, and finally the global
    counter is advanced with ``update_cnt``.

    Raises:
        ValueError: If fewer characters were typed than there are pieces, or
            if a ``count.txt`` file does not contain an integer.
    """
    tmp = get_cnt() + 1
    ims = process_pic(tmp)
    code = var.get()
    # Validate before touching the disk so a short label cannot leave some
    # pieces saved and others not.
    if len(code) < len(ims):
        raise ValueError(
            'expected %d characters, got %d' % (len(ims), len(code)))

    base_dir = os.path.dirname(os.path.realpath(__file__))
    # zip() stops after the last piece, so extra typed characters are
    # ignored, as before.
    for char, char_im in zip(code, ims):
        path = os.path.join(base_dir, 'sets', char)
        os.makedirs(path, exist_ok=True)
        filepath = os.path.join(path, 'count.txt')
        try:
            with open(filepath, 'r') as f:
                # int() instead of eval(): the file is data, not code.
                char_cnt = int(f.readline())
        except FileNotFoundError:
            # New directory, or one that never got a counter file.
            char_cnt = 0
        char_cnt += 1
        char_im.save(os.path.join(path, str(char_cnt) + '.png'))
        with open(filepath, 'w') as f:
            f.write(str(char_cnt))
    update_cnt(tmp)

代码测试

Alt text

预处理完整代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import tkinter as tk
import requests
from PIL import Image, ImageTk, ImageFilter
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os


def get_cnt():
    """Return the integer counter stored in ``count.txt``.

    If the file cannot be read or its first line is not an integer, the file
    is (re)created with the content ``0`` and 0 is returned.

    Returns:
        The stored counter as an ``int``.
    """
    try:
        with open('count.txt', 'r') as f:
            # int() instead of eval(): the file content is data and must
            # never be executed as an expression.
            return int(f.readline())
    except (OSError, ValueError):
        # Missing, unreadable or corrupt counter: start over from zero.
        with open('count.txt', 'w') as f:
            f.write('0')
        return 0


def update_cnt(cnt):
    """Overwrite ``count.txt`` with the text form of ``cnt``.

    Args:
        cnt: New counter value; it is written as ``str(cnt)``.
    """
    # Plain write mode is enough: the file is truncated and never read here.
    with open('count.txt', 'w') as f:
        f.write(str(cnt))


def get_checkcode(i):
    """Download one captcha image and store it as ``<i>.png``.

    Args:
        i: Integer sequence number used to build the file name.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the target file cannot be opened for writing.
    """
    # NOTE(review): the host part of this URL is empty in the source text;
    # fill in the real site before running.
    # The timeout keeps a stalled server from blocking forever, and
    # raise_for_status() stops an HTTP error page from being written to disk
    # as if it were an image.
    r = requests.get('http:///CheckCode.aspx?', timeout=10)
    r.raise_for_status()
    picname = str(i) + '.png'
    with open(picname, 'wb') as f:
        f.write(r.content)


def process_pic(i):
    """Binarize captcha ``<i>.png`` and cut it into four character images.

    Every pixel value other than 43 maps to a truthy value and the value 43
    to a falsy one, with output mode ``'1'``.

    Args:
        i: Sequence number of the captcha to process.

    Returns:
        A list of four crops covering the columns 5-17, 17-29, 29-41 and
        41-53, each spanning rows 0 to 22.
    """
    source = Image.open(str(i) + '.png')
    binary = source.point(lambda value: value != 43, mode='1')

    top, bottom = 0, 22
    split_lines = [5, 17, 29, 41, 53]
    pieces = []
    # Neighbouring split lines are the left and right edge of one character.
    for left, right in zip(split_lines[:-1], split_lines[1:]):
        pieces.append(binary.crop([left, top, right, bottom]))
    return pieces


def get_pic_for_display(i):
    """Load ``<i>.png`` and return it enlarged as a Tk-compatible image.

    Args:
        i: Sequence number of the captcha file to show.

    Returns:
        An ``ImageTk.PhotoImage`` built from the picture after it has been
        scaled by ``resize`` to fit a 300 x 200 box.
    """
    source = Image.open(str(i) + '.png')
    width, height = source.size
    # Bounding box, in pixels, that the preview has to fit into.
    box_width, box_height = 300, 200
    enlarged = resize(width, height, box_width, box_height, source)
    return ImageTk.PhotoImage(enlarged)


def resize(w, h, w_box, h_box, pil_image):
    """Scale ``pil_image`` to fit a ``w_box`` x ``h_box`` box, keeping its aspect ratio.

    Args:
        w: Current width of the image in pixels.
        h: Current height of the image in pixels.
        w_box: Width of the bounding box.
        h_box: Height of the bounding box.
        pil_image: Image object whose ``resize`` method is called.

    Returns:
        The resized image returned by ``pil_image.resize``.

    Raises:
        ZeroDivisionError: If ``w`` or ``h`` is 0.
    """
    # The smaller of the two ratios is the largest factor that still keeps
    # both dimensions inside the box.
    factor = min(w_box / w, h_box / h)
    # Truncation could yield 0 for extreme aspect ratios; keep at least 1 px.
    width = max(1, int(w * factor))
    height = max(1, int(h * factor))
    # Image.ANTIALIAS was an alias of LANCZOS and no longer exists in
    # Pillow 10+, so the filter is referenced by its current name.
    return pil_image.resize((width, height), Image.LANCZOS)


def display_pic():
    """Download the next captcha and show it together with the progress text.

    The next sequence number is ``get_cnt() + 1``. The picture is fetched with
    ``get_checkcode``, converted by ``get_pic_for_display`` and assigned to
    ``picLabel``; ``cntLabel`` is updated with the current count.
    """
    # The image object is kept in a module-level name, presumably so that it
    # stays referenced while the label displays it.
    global im
    tmp = get_cnt() + 1
    get_checkcode(tmp)
    # The picture is loaded once, through PIL. The former extra
    # tk.PhotoImage(file=...) load was overwritten immediately and could
    # raise for formats Tk cannot decode on its own.
    im = get_pic_for_display(tmp)
    picLabel['image'] = im
    cntLabel['text'] = '总计: ' + str(tmp - 1) + '/1000'


def save_imgs():
    """Store the pieces of the current captcha under their typed labels.

    The captcha with number ``get_cnt() + 1`` is split by ``process_pic``.
    Piece ``k`` is saved as ``sets/<k-th typed character>/<n>.png`` next to
    this script, where ``n`` is that directory's counter from ``count.txt``
    plus one. The counter file is then rewritten, and finally the global
    counter is advanced with ``update_cnt``.

    Raises:
        ValueError: If fewer characters were typed than there are pieces, or
            if a ``count.txt`` file does not contain an integer.
    """
    tmp = get_cnt() + 1
    ims = process_pic(tmp)
    code = var.get()
    # Validate before touching the disk so a short label cannot leave some
    # pieces saved and others not.
    if len(code) < len(ims):
        raise ValueError(
            'expected %d characters, got %d' % (len(ims), len(code)))

    base_dir = os.path.dirname(os.path.realpath(__file__))
    # zip() stops after the last piece, so extra typed characters are
    # ignored, as before.
    for char, char_im in zip(code, ims):
        path = os.path.join(base_dir, 'sets', char)
        os.makedirs(path, exist_ok=True)
        filepath = os.path.join(path, 'count.txt')
        try:
            with open(filepath, 'r') as f:
                # int() instead of eval(): the file is data, not code.
                char_cnt = int(f.readline())
        except FileNotFoundError:
            # New directory, or one that never got a counter file.
            char_cnt = 0
        char_cnt += 1
        char_im.save(os.path.join(path, str(char_cnt) + '.png'))
        with open(filepath, 'w') as f:
            f.write(str(char_cnt))
    update_cnt(tmp)


def submit():
    """Handle a click on the submit button.

    Saves the pieces of the current captcha under the typed label, shows the
    next captcha and clears the entry field.
    """
    save_imgs()
    display_pic()
    var.set('')


def init():
    """Show the first captcha; called once before the Tk main loop starts."""
    display_pic()


# Build the labelling window: picture on top, entry field, submit button and
# a progress label, then show the first captcha and enter the event loop.

# A ``global`` statement at module level has no effect; ``im`` is assigned
# inside display_pic().
global im
app = tk.Tk()
app.title('Labeller')
app.geometry('500x260')
# Label that displays the captcha picture (filled in by display_pic()).
picLabel = tk.Label(app)
picLabel.pack()
# Text typed by the user; read by save_imgs() and cleared by submit().
var = tk.StringVar()
textInput = tk.Entry(app, textvariable=var)
textInput.pack(expand='yes', fill='both', padx=100, side='top', pady=10)
submitButton = tk.Button(app, text="提交", width='10', command=submit)
submitButton.pack()
# Label that displays the progress text (filled in by display_pic()).
cntLabel = tk.Label(app)
cntLabel.pack(pady=20)
init()
app.mainloop()

Alt text

验证码训练

###

1
2
3
4
5
6
7
8
9
10
11
12
def process(data):
    """Binarize a pixel array in place: every value greater than 0 becomes 1.

    Args:
        data: NumPy array of pixel values (``load_image`` passes a 1 x N
            array). It is modified in place.

    Returns:
        The same array object that was passed in.
    """
    # A boolean mask covers every element. The former index loop ran over
    # len(data), which is 1 for a 1 x N array, so only the first pixel was
    # ever converted.
    data[data > 0] = 1
    return data


def load_image(path):
    """Read an image file into a single-row array and pass it through ``process``.

    Args:
        path: Location of the image file to open.

    Returns:
        The array returned by ``process`` for the image's pixel data
        reshaped to one row.
    """
    # The context manager closes the underlying file as soon as the pixel
    # data has been copied into the array.
    with Image.open(path) as im:
        data = np.array(im.getdata()).reshape(1, -1)
    return process(data)
  • data = np.array(im.getdata()).reshape(1, -1)中的im.getdata()以包含像素值的sequence对象形式返回图像的内容;对于RGB图像,该对象的每一个元素对应一个像素点的R、G和B三个值。reshape(1, -1)再把它整理成只有一行的二维数组。