ISBN数字识别


ISBN识别

学校三级项目需要批量识别ISBN中的数字

实现的大致思路如下:

对原始图片按尺寸自动调整大小,高斯滤波去噪,灰度化,二值化,边缘检测后闭操作,查找最大轮廓,获取最小外接矩形及旋转角度,旋转摆正图片,水平投影,提取字符区域,用pytesseract识别字符

项目代码存放在三个文件中

1.工具类ocr_tool.py

import re
from difflib import SequenceMatcher


def obtain_digit(data):
    """
    Return every decimal digit contained in ``data``, concatenated in order.

    :param data: text to scan, e.g. raw OCR output or an image file name
    :return: a string made only of the digits of ``data`` (empty when there are none)
    """
    # Deleting every non-digit character leaves exactly the digit runs, in order.
    return re.sub(r"\D", "", data)


def crct_digit_cnt(crct_isbn, recog_isbn):
    """
    Score how much of the recognised ISBN agrees with the expected one.

    The score is the length of the longest contiguous block that appears in
    both strings, as reported by ``difflib.SequenceMatcher``.

    :param crct_isbn: the expected (correct) digit string
    :param recog_isbn: the digit string produced by recognition
    :return: length of the longest common contiguous block (0 if nothing matches)
    """
    matcher = SequenceMatcher(None, crct_isbn, recog_isbn)
    longest = matcher.find_longest_match(0, len(crct_isbn), 0, len(recog_isbn))
    return longest.size

2.图片预处理相关函数img_process_tool.py

from math import fabs, sin, radians, cos

import cv2 as cv
import numpy as np


def img_show_wait(img, window_name, duration=100):
    """
    Show an image in a named window, then block in ``cv.waitKey``.

    :param img: image to display
    :param window_name: name of the window the image is shown in
    :param duration: delay handed to ``cv.waitKey`` (milliseconds per the OpenCV
        documentation); the default is 100.
        NOTE(review): the previous docstring said the default waits for a key
        press and never closes the window automatically, which does not match a
        default of 100 - confirm which behaviour is intended.
    """
    cv.imshow(window_name, img)
    cv.waitKey(duration)


def get_projection_list(binary_img, direction='horizontal'):
    """
    Count the white (value 255) pixels of a binary image along one direction.

    The count is vectorised with numpy instead of visiting every pixel in a
    Python loop, and only the requested direction is computed.

    :param binary_img: binary image as a numpy array of shape (rows, cols); if
        extra trailing axes are present, a pixel counts as white only when all
        of its channel values equal 255
    :param direction: ``'horizontal'`` returns one count per row; any other
        value returns one count per column
    :return: list of Python ints, of length rows (horizontal) or cols (vertical)
    """
    white = np.asarray(binary_img) == 255
    if white.ndim > 2:
        # Collapse channel axes so that each pixel yields a single boolean.
        white = white.all(axis=tuple(range(2, white.ndim)))
    # Summing across columns (axis 1) gives per-row counts, and vice versa.
    axis = 1 if direction == 'horizontal' else 0
    return np.count_nonzero(white, axis=axis).tolist()


def draw_projection(data_list, rows, cols, direction='horizontal'):
    """
    Draw a projection histogram as black bars on a white canvas, show it and return it.

    :param data_list: projection counts; indexed by row for a horizontal
        projection and by column for a vertical one
    :param rows: number of rows of the source image (canvas height)
    :param cols: number of columns of the source image (canvas width)
    :param direction: ``'horizontal'`` draws bars growing from the left edge;
        any other value draws bars growing upwards from the bottom edge
    :return: the projection image (``uint8`` array of shape (rows, cols))
    """
    img_proj = np.ones(shape=(rows, cols), dtype=np.uint8) * 255
    peak = np.max(data_list)
    # An all-zero projection (blank image) used to divide by zero and then fail
    # on int(nan); with no peak, every bar simply gets length zero.
    if direction == 'horizontal':
        # Scale so the largest count spans the full canvas width.
        weight = cols / peak if peak > 0 else 0
        for row in range(rows):
            pt1 = (0, row)
            pt2 = (int(weight * data_list[row]), row)
            cv.line(img_proj, pt1, pt2, (0,), 1)
        img_show_wait(img_proj, 'horizontal projection')
    else:
        # Scale so the largest count spans the full canvas height.
        weight = rows / peak if peak > 0 else 0
        for col in range(cols):
            pt1 = (col, rows - 1)
            pt2 = (col, rows - 1 - int(weight * data_list[col]))
            cv.line(img_proj, pt1, pt2, (0,), 1)
        img_show_wait(img_proj, 'vertical projection')
    return img_proj


def split_projection_list(proj_list: list, min_val=0):
    """
    Split projection counts into the intervals where the counts exceed a threshold.

    Each interval is a ``(start, end)`` tuple. ``end`` is the index of the last
    value of the run that is greater than ``min_val`` (inclusive). ``start`` is
    the index of the last value at or below the threshold seen before the run,
    or 0 when the run begins at the very first index.

    A run that reaches the end of ``proj_list`` is also reported; previously
    such a trailing run was silently dropped because intervals were only
    emitted when a below-threshold value followed them.

    :param proj_list: projection counts
    :param min_val: threshold; values greater than it belong to a region
    :return: list of ``(start, end)`` tuples in ascending order
    """
    start = 0
    end = None
    split_list = []
    for idx, value in enumerate(proj_list):
        if value > min_val:
            end = idx
            continue
        if end is not None:
            # A below-threshold value closes the run that was open.
            split_list.append((start, end))
            end = None
        start = idx
    if end is not None:
        # Flush the run that was still open when the data ended.
        split_list.append((start, end))
    return split_list


def img_rotate(img, degree):
    """
    Rotate an image about its centre on a canvas enlarged to hold the result.

    :param img: image to rotate
    :param degree: rotation angle in degrees, as given to ``cv.getRotationMatrix2D``
    :return: tuple ``(rotated image, 2x3 affine matrix that was applied)``
    """
    height, width = img.shape[:2]
    theta = radians(degree)
    abs_sin = fabs(sin(theta))
    abs_cos = fabs(cos(theta))
    # Bounding box of the rotated rectangle: the enlarged canvas size.
    new_height = int(width * abs_sin + height * abs_cos)
    new_width = int(height * abs_sin + width * abs_cos)
    rotation = cv.getRotationMatrix2D((width // 2, height // 2), degree, 1)
    # Shift the result so it is centred on the enlarged canvas.
    rotation[0, 2] += (new_width - width) // 2
    rotation[1, 2] += (new_height - height) // 2
    # Areas uncovered by the rotation are filled with white.
    rotated = cv.warpAffine(img, rotation, (new_width, new_height), borderValue=(255, 255, 255))
    return rotated, rotation


def draw_box(img, box):
    """
    Draw the four edges of a quadrilateral on an image, in green, 3 px thick.

    :param img: image to draw on (modified in place)
    :param box: flat sequence of 8 coordinates ``x0, y0, x1, y1, x2, y2, x3, y3``
    :return: the same image that was passed in
    """
    colour, thickness = (0, 255, 0), 3
    # Offsets into ``box`` of the x coordinate of each edge's two end points.
    for first, second in ((0, 2), (2, 4), (0, 6), (4, 6)):
        cv.line(img, (box[first], box[first + 1]), (box[second], box[second + 1]), colour, thickness)
    return img


def img_resize(img_original):
    """
    Shrink an image by a factor chosen from its height.

    More than 1300 rows scales to 25 %, more than 750 rows to 50 %, more than
    500 rows to 75 %; an image of 500 rows or fewer is returned unchanged.

    :param img_original: input image
    :return: the resized image, or the input itself when no resize is needed
    """
    rows, _ = img_original.shape[:2]
    # Ordered from the largest threshold down, so the first match wins.
    for min_rows, scale in ((1300, 0.25), (750, 0.5), (500, 0.75)):
        if rows > min_rows:
            return cv.resize(img_original, None, fx=scale, fy=scale, interpolation=cv.INTER_CUBIC)
    return img_original


def adaptive_threshold(gray, block_size=5, c=10, inv=False):
    """
    Binarise a grayscale image with Gaussian adaptive thresholding.

    The threshold is computed per neighbourhood, so it follows local changes in
    brightness across the image.

    :param gray: input grayscale image
    :param block_size: neighbourhood size passed to ``cv.adaptiveThreshold``
    :param c: constant passed to ``cv.adaptiveThreshold``
    :param inv: when true use ``THRESH_BINARY_INV``, otherwise ``THRESH_BINARY``
    :return: the binarised image (maximum value 255)
    """
    mode = cv.THRESH_BINARY_INV if inv else cv.THRESH_BINARY
    return cv.adaptiveThreshold(gray, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, mode, block_size, c)


def img_preprocess(img, kernel=None):
    """
    Turn a colour image into a closed edge map.

    Steps: 3x3 Gaussian blur, grayscale conversion, inverted Otsu thresholding,
    Canny edge detection, then a morphological close. The result is also shown
    in a window named 'original img close'.

    :param img: input BGR image
    :param kernel: structuring element for the close; a 5x5 rectangle when omitted
    :return: the edge map after the closing operation
    """
    if kernel is None:
        kernel = cv.getStructuringElement(cv.MORPH_RECT, (5, 5), (-1, -1))
    blurred = cv.GaussianBlur(img, (3, 3), 0)  # denoise
    gray = cv.cvtColor(blurred, cv.COLOR_BGR2GRAY)
    # Otsu picks the global threshold, so the 0 passed here is ignored.
    _, binary = cv.threshold(gray, 0, 255, cv.THRESH_BINARY_INV + cv.THRESH_OTSU)
    edges = cv.Canny(binary, 50, 50)
    # Closing (dilate then erode) joins nearby edges into connected regions.
    closed = cv.morphologyEx(edges, cv.MORPH_CLOSE, kernel)
    img_show_wait(closed, 'original img close')
    return closed


if __name__ == '__main__':
    # Manual check: preprocess one image and display its horizontal projection.
    path_to_img = r'D:\projects_python\workingon\isbnocr\pageocr\xxx.png'
    img_pre = img_preprocess(cv.imread(path_to_img))
    rows, cols = img_pre.shape[:2]
    draw_projection(get_projection_list(img_pre, 'horizontal'), rows, cols, 'horizontal')

3.核心代码isbnocr.py

import os
from os import listdir

import pytesseract

from img_process_tool import *
from ocr_tool import *


def img_isbn_area(img_original):
    """
    Deskew an image and crop the horizontal band taken to be the text area.

    The largest contour of the preprocessed image provides the skew angle; the
    image is rotated by that angle, preprocessed again and projected
    horizontally. Among the resulting row bands, the last one that is no taller
    than half of the image is cropped at full width. If no band qualifies, the
    whole rotated image is returned.

    :param img_original: input BGR image
    :return: the cropped region of the rotated image, or ``None`` when the
        preprocessed image contains no contour at all
    """
    # A wide, flat kernel merges the characters of a text line into one blob.
    kernel = cv.getStructuringElement(cv.MORPH_RECT, (100, 5), (-1, -1))
    img_pre = img_preprocess(img_original, kernel)
    # findContours returns (image, contours, hierarchy) on OpenCV 3.x and
    # (contours, hierarchy) on 4.x; index -2 is the contour list on both.
    contours = cv.findContours(img_pre, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)[-2]
    if not contours:
        # Nothing to measure the skew from; callers already test for None.
        return None
    largest = max(contours, key=cv.contourArea)
    # Element 2 of the minimum-area rectangle is its rotation angle.
    angle = cv.minAreaRect(largest)[2]
    if angle > 80:
        angle -= 90  # a near-vertical reading of an almost upright box
    img_rotated, _ = img_rotate(img_original, angle)
    img_rotated_pre = img_preprocess(img_rotated, kernel)
    # Horizontal projection: one band per run of rows containing edge pixels.
    proj_list = get_projection_list(img_rotated_pre)
    split_list = split_projection_list(proj_list, 0)
    img_h, img_w = img_rotated_pre.shape[:2]
    x, y, w, h = 0, 0, img_w, img_h
    for start, end in split_list:
        if end - start > img_h * 0.5:
            continue  # too tall to be a single line of text
        if end <= start:
            continue  # zero-height band: cannot be displayed or cropped
        x, y, w, h = 0, start, img_w, end - start
        img_show_wait(img_rotated[y:y + h, x:x + w], 'roi')
        # NOTE(review): the original also computed a vertical projection of the
        # whole preprocessed image here and ended the loop body with
        # `if len(...) < 12: continue`. That check could never reject a band,
        # so the computation was removed; a real "at least 12 characters"
        # filter would have to be designed separately.
    return img_rotated[y:y + h, x:x + w]


def split_digits(img_text):
    """
    Cut a text-line image into single-character images using a vertical projection.

    The inverted Otsu binarisation and the projection plot are both shown in
    windows as a side effect.

    :param img_text: BGR image of the text area
    :return: list of column slices of ``img_text``, one per projected region
    """
    isbn_gray = cv.cvtColor(img_text, cv.COLOR_BGR2GRAY)
    _, isbn_bin = cv.threshold(isbn_gray, 0, 255, cv.THRESH_OTSU + cv.THRESH_BINARY_INV)
    img_show_wait(isbn_bin, 'isbn area bin')
    ver_proj_list = get_projection_list(isbn_bin, 'vertical')
    rows, cols = img_text.shape[:2]
    draw_projection(ver_proj_list, rows, cols, 'vertical')
    # The `end` of each interval is the last column that still has ink, so the
    # slice must run to end + 1; stopping at `end` cut off the final column of
    # every character and produced an empty image for a one-column region at
    # index 0.
    return [img_text[:, start:end + 1] for start, end in split_projection_list(ver_proj_list)]


def digit_recog(path_to_image):
    """
    Recognise the ISBN number in a single image and display its split characters.

    The recognised digits are printed; each character image cut from the text
    area is then shown in a window named 'digit'.

    :param path_to_image: absolute path of the image
    :return: None
    :raises ValueError: if the image cannot be read
    """
    img_original = cv.imread(path_to_image)
    if img_original is None:
        # cv.imread signals failure by returning None instead of raising.
        raise ValueError(f'无法读取图片:{path_to_image}')
    isbn_area = img_isbn_area(img_resize(img_original))
    if isbn_area is None:
        # Check before running OCR: there is no text area to pass to it.
        print(f'【图片{os.path.basename(path_to_image)}】未找到字符区域')
        return
    recog_isbn = obtain_digit(pytesseract.image_to_string(isbn_area))
    print(f'【图片{os.path.basename(path_to_image)}】的识别结果为:{recog_isbn}')
    for digit in split_digits(isbn_area):
        img_show_wait(digit, 'digit')


def digit_recog_batch(path_to_images):
    """
    Recognise the ISBN in every image of a folder and print accuracy statistics.

    The expected ISBN of an image is taken from the digits in its file name.
    Entries that cannot be read as images are reported and skipped.

    :param path_to_images: absolute path of the folder holding the images
    :return: None
    """
    # NOTE(review): only images that yield at least one recognised digit are
    # counted, so total recognition failures do not lower the reported rates -
    # confirm this is the intended metric.
    img_cnt = 0  # images that produced a non-empty recognition result
    digit_cnt = 0  # expected digits over the counted images
    digit_recognized = 0  # digits credited as correctly recognised
    isbn_recognized = 0  # images whose full expected ISBN appears in the result
    for file in listdir(path_to_images):
        img_abs_path = os.path.join(path_to_images, file)
        img_basename = os.path.basename(img_abs_path)
        original_image = cv.imread(img_abs_path, cv.IMREAD_COLOR)
        if original_image is None:
            # Sub-directories and non-image files make cv.imread return None.
            print(f'【图片{img_basename}】无法读取,已跳过')
            continue
        isbn_area = img_isbn_area(img_resize(original_image))
        if isbn_area is None:
            continue
        recog_isbn = obtain_digit(pytesseract.image_to_string(isbn_area))
        print(f'【图片{img_basename}】的识别结果为:{recog_isbn}')
        if not recog_isbn:
            continue
        expected_isbn = obtain_digit(img_basename)
        img_cnt += 1
        digit_cnt += len(expected_isbn)
        digit_recognized += crct_digit_cnt(expected_isbn, recog_isbn)
        if expected_isbn in recog_isbn:
            isbn_recognized += 1

    # Guard the ratios: an empty folder or a batch with no recognised digits
    # used to end in ZeroDivisionError.
    isbn_rate = isbn_recognized / img_cnt if img_cnt else 0.0
    digit_rate = digit_recognized / digit_cnt if digit_cnt else 0.0
    print("正确识别的ISBN个数:" + str(isbn_recognized) + "/" + str(img_cnt))
    print("正确识别的数字个数:" + str(digit_recognized) + "/" + str(digit_cnt))
    print("识别正确率:" + str(isbn_rate))
    print("识别准确率:" + str(digit_rate))


# TODO: character recognition could use a neural network or template matching instead
if __name__ == "__main__":
    # Machine-specific location of the Tesseract executable used by pytesseract.
    pytesseract.pytesseract.tesseract_cmd = r"D:\software\Tesseract-OCR\tesseract.exe"
    # Sample single image and the folder of images for the batch run (machine-specific paths).
    path_to_image = r'D:\projects_python\workingon\isbnocr\isbn_recognition\ISBN 978-7-5099-1125-9.png'
    path_to_images = r'D:\projects_python\workingon\isbnocr\isbn_recognition\images'
    # digit_recog(path_to_image)  # recognise a single image
    digit_recog_batch(path_to_images)  # recognise the ISBN in every image of the folder

这个项目做下来,感觉比较有参考价值的还是图片预处理的思路和相关实现。至于字符识别,由于时间关系没来得及自己实现,就直接用了pytesseract这种比较粗陋的方法,后面有时间再改成模板匹配或者神经网络识别字符。

项目的源代码和图片数据集都同步到gitee上了,地址在这里:isbn-ocr: 计算机视觉课程设计 识别ISBN中的数字 (gitee.com)