scrollunroll/ocr_processor.py at main · comoysha/scrollunroll · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3

import pytesseract
from PIL import Image
import os
import gc

class OCRProcessor:
    """Extract text lines from (possibly very tall) images with Tesseract.

    Tall images are cut into vertically overlapping strips, every strip is
    OCR'd on its own, and the per-strip lines are merged back together.
    """

    # Upper bound on how many trailing/leading lines are compared when looking
    # for text that was recognized twice because two strips overlap.
    _MAX_OVERLAP_LINES = 10

    def __init__(self):
        """Initialize the OCR processor (it keeps no state)."""

    def split_image_for_tesseract(self, image, chunk_height=3000, overlap=100):
        """Split a tall image into vertically overlapping strips.

        Args:
            image: Object exposing ``size`` as ``(width, height)`` and
                ``crop((left, upper, right, lower))``; the rest of this class
                passes a PIL image.
            chunk_height: Nominal strip height in pixels.
            overlap: Number of pixel rows shared by consecutive strips.

        Returns:
            A list of ``(chunk, chunk_num, start_y, end_y)`` tuples ordered
            top to bottom. When ``height <= chunk_height`` the list contains
            the original ``image`` object itself as its only chunk.

        Raises:
            ValueError: If the image has to be split but ``chunk_height`` is
                not positive or ``overlap`` is outside ``[0, chunk_height)``.
        """
        width, height = image.size

        print(f"图像尺寸: {width}x{height}")

        # Small enough: hand back the image itself, no cropping.
        if height <= chunk_height:
            print("图像尺寸适中，无需分割")
            return [(image, 0, 0, height)]

        # A non-positive step would never advance the cursor and a negative
        # overlap would leave unscanned gaps, so reject both up front. The
        # check sits after the early return so calls that never split keep
        # working exactly as before.
        if chunk_height <= 0 or not 0 <= overlap < chunk_height:
            raise ValueError(
                "chunk_height must be positive and overlap must satisfy "
                f"0 <= overlap < chunk_height (got chunk_height={chunk_height}, "
                f"overlap={overlap})"
            )

        print(f"开始分割图像...")

        step = chunk_height - overlap
        chunks = []
        y = 0
        while True:
            # If what is left would produce only a sliver after this strip,
            # extend this strip to the bottom of the image instead.
            if height - y < chunk_height + overlap:
                end_y = height
            else:
                end_y = y + chunk_height

            chunk_num = len(chunks)
            chunk = image.crop((0, y, width, end_y))
            chunks.append((chunk, chunk_num, y, end_y))

            print(f"分割块 {chunk_num + 1}: {chunk.size} (y: {y}-{end_y})")

            # step > 0 guarantees progress, so reaching the bottom is the only
            # exit that is needed; no strip-count cap that could silently drop
            # the rest of a very tall image.
            if end_y >= height:
                break
            y += step

        print(f"总共分割成 {len(chunks)} 个块")
        return chunks

    def ocr_chunk_with_tesseract(self, chunk, chunk_num):
        """Run Tesseract on one strip and return its cleaned text lines.

        Several Tesseract configurations are tried in order; the first one
        that yields non-blank text wins.

        Args:
            chunk: Image strip; must expose ``height`` and be accepted by
                ``pytesseract.image_to_string``.
            chunk_num: Zero-based strip index used in progress messages, or
                ``None`` to suppress those messages.

        Returns:
            A list of stripped lines longer than one character. The list is
            empty when the strip is under 20 pixels tall, when nothing was
            recognized, or when OCR failed.
        """
        try:
            # Strips this short are skipped without calling Tesseract.
            if chunk.height < 20:
                return []

            configs = (
                r'--oem 3 --psm 6 -l chi_sim',
                r'--oem 3 --psm 3 -l chi_sim',
                r'--oem 3 --psm 6 -l chi_sim+eng',
            )

            text = ""
            errors = []
            for config in configs:
                try:
                    text = pytesseract.image_to_string(chunk, config=config)
                except Exception as exc:
                    # Best effort: remember the error and try the next config.
                    errors.append(exc)
                    continue
                if text.strip():
                    break

            # Every configuration raised: report a failure instead of
            # pretending the strip simply contained no text.
            if len(errors) == len(configs):
                if chunk_num is not None:
                    print(f"块 {chunk_num + 1} OCR失败: {errors[-1]}")
                return []

            # Strip whitespace and drop blank or single-character lines.
            stripped = (raw.strip() for raw in text.split('\n'))
            lines = [line for line in stripped if len(line) > 1]

            if chunk_num is not None:
                print(f"块 {chunk_num + 1} 识别了 {len(lines)} 行文字")

            return lines

        except Exception as e:
            if chunk_num is not None:
                print(f"块 {chunk_num + 1} OCR失败: {e}")
            return []

    @staticmethod
    def _drop_adjacent_duplicates(lines):
        """Return ``lines`` without immediately repeated entries."""
        return [
            line
            for index, line in enumerate(lines)
            if index == 0 or line != lines[index - 1]
        ]

    @staticmethod
    def _merge_overlapping_lines(merged, new_lines, max_overlap=10):
        """Append ``new_lines`` to ``merged`` in place, skipping shared lines.

        The longest run of at most ``max_overlap`` lines that both ends
        ``merged`` and starts ``new_lines`` is treated as text recognized
        twice and is appended only once. Returns ``merged``.
        """
        limit = min(len(merged), len(new_lines), max_overlap)
        skip = 0
        for size in range(limit, 0, -1):
            if merged[-size:] == new_lines[:size]:
                skip = size
                break
        merged.extend(new_lines[skip:])
        return merged

    def extract_text_tesseract(self, image_path):
        """Extract the text lines of an image file using Tesseract.

        Args:
            image_path: Path of the image file to read.

        Returns:
            The recognized lines from top to bottom, with immediately
            repeated lines and lines duplicated across strip boundaries
            removed. An empty list is returned when the file does not exist
            or when processing fails.
        """
        if not os.path.exists(image_path):
            print(f"错误: 图像文件不存在 {image_path}")
            return []

        try:
            # The context manager closes the underlying file handle even if
            # OCR fails part-way through.
            with Image.open(image_path) as source:
                image = source if source.mode == 'RGB' else source.convert('RGB')

                chunks = self.split_image_for_tesseract(image)

                final_texts = []
                while chunks:
                    # Pop the entry so the list no longer keeps the strip
                    # alive once it has been processed.
                    chunk, chunk_num, _start_y, _end_y = chunks.pop(0)
                    chunk_texts = self._drop_adjacent_duplicates(
                        self.ocr_chunk_with_tesseract(chunk, chunk_num)
                    )
                    # Consecutive strips overlap, so the head of this strip
                    # may repeat the tail of what has been collected so far.
                    self._merge_overlapping_lines(
                        final_texts, chunk_texts, self._MAX_OVERLAP_LINES
                    )

                    # Identity check on purpose: comparing PIL images with
                    # != compares their pixel data.
                    if chunk is not image:
                        del chunk
                    gc.collect()

                return final_texts

        except Exception as e:
            print(f"OCR处理失败: {e}")
            return []

    def extract_text(self, image_path):
        """Compatibility wrapper around :meth:`extract_text_tesseract`."""
        return self.extract_text_tesseract(image_path)

    def save_text(self, texts, output_path):
        """Write the recognized lines to a UTF-8 text file, one per line.

        Args:
            texts: Iterable of strings to write.
            output_path: Destination file path; an existing file is
                overwritten.

        Returns:
            ``True`` when the file was written, ``False`` when any error
            occurred (the error is printed, not raised).
        """
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                for text in texts:
                    f.write(text + '\n')
            print(f"文字已保存到: {output_path}")
            return True
        except Exception as e:
            print(f"保存文字失败: {e}")
            return False