-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathocr_processor.py
More file actions
157 lines (124 loc) · 5.05 KB
/
ocr_processor.py
File metadata and controls
157 lines (124 loc) · 5.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
import pytesseract
from PIL import Image
import os
import gc
class OCRProcessor:
    """Extract text lines from (possibly very tall) images with Tesseract.

    Tall images are cut into overlapping horizontal bands, each band is
    OCR'd separately, and the recognized lines are concatenated.
    """

    def __init__(self):
        """Initialize the OCR processor (no state is kept)."""

    def split_image_for_tesseract(self, image, chunk_height=3000, overlap=100):
        """Split a tall image into horizontal bands small enough for Tesseract.

        Args:
            image: Object exposing ``size`` as ``(width, height)`` and
                ``crop((left, top, right, bottom))`` (e.g. a PIL image).
            chunk_height: Nominal height of each band in pixels.
            overlap: Number of pixel rows shared by consecutive bands.

        Returns:
            A list of ``(chunk, chunk_num, start_y, end_y)`` tuples in
            top-to-bottom order. When the image is not taller than
            ``chunk_height`` the list holds the original image itself.

        Raises:
            ValueError: If ``chunk_height`` is not positive or ``overlap`` is
                negative or not smaller than ``chunk_height``; in those cases
                the cursor could not advance or bands would leave gaps.
        """
        if chunk_height <= 0:
            raise ValueError(
                f"chunk_height must be positive, got {chunk_height}"
            )
        if not 0 <= overlap < chunk_height:
            raise ValueError(
                "overlap must satisfy 0 <= overlap < chunk_height, "
                f"got overlap={overlap}, chunk_height={chunk_height}"
            )

        width, height = image.size
        print(f"图像尺寸: {width}x{height}")

        # Small enough already: hand back the image itself as the only chunk.
        if height <= chunk_height:
            print("图像尺寸适中,无需分割")
            return [(image, 0, 0, height)]

        print("开始分割图像...")
        chunks = []
        # Positive thanks to the validation above, so the loop always ends
        # and no artificial cap on the number of chunks is needed.
        step = chunk_height - overlap
        y = 0
        while True:
            if height - y < chunk_height + overlap:
                # The remainder would leave only a sliver for the next band,
                # so let this band run to the bottom of the image.
                end_y = height
            else:
                end_y = y + chunk_height

            chunk = image.crop((0, y, width, end_y))
            chunk_num = len(chunks)
            chunks.append((chunk, chunk_num, y, end_y))
            print(f"分割块 {chunk_num + 1}: {chunk.size} (y: {y}-{end_y})")

            if end_y >= height:
                break
            y += step

        print(f"总共分割成 {len(chunks)} 个块")
        return chunks

    def ocr_chunk_with_tesseract(self, chunk, chunk_num):
        """Run Tesseract on a single image chunk.

        Several Tesseract configurations are tried in order until one yields
        non-blank text.

        Args:
            chunk: Image chunk to recognize; must expose ``height``.
            chunk_num: Zero-based chunk index used in progress messages, or
                ``None`` to suppress the messages.

        Returns:
            The recognized lines, stripped, with blank and single-character
            lines removed. An empty list is returned when the chunk is less
            than 20 pixels high or when OCR fails.
        """
        try:
            # Bands this thin are skipped rather than sent to Tesseract.
            if chunk.height < 20:
                return []

            configs = (
                r'--oem 3 --psm 6 -l chi_sim',
                r'--oem 3 --psm 3 -l chi_sim',
                r'--oem 3 --psm 6 -l chi_sim+eng',
            )

            text = ""
            failures = 0
            last_error = None
            for config in configs:
                try:
                    text = pytesseract.image_to_string(chunk, config=config)
                except Exception as exc:
                    # Keep going: a later configuration may still work.
                    failures += 1
                    last_error = exc
                    continue
                if text.strip():
                    break

            # Every configuration raised: report it as a failure through the
            # handler below instead of pretending that nothing was recognized.
            if last_error is not None and failures == len(configs):
                raise last_error

            stripped = (raw.strip() for raw in text.split('\n'))
            # Single characters are treated as OCR noise and dropped.
            lines = [line for line in stripped if len(line) > 1]

            if chunk_num is not None:
                print(f"块 {chunk_num + 1} 识别了 {len(lines)} 行文字")
            return lines
        except Exception as e:
            if chunk_num is not None:
                print(f"块 {chunk_num + 1} OCR失败: {e}")
            return []

    @staticmethod
    def _drop_adjacent_duplicates(lines):
        """Return ``lines`` without entries equal to their predecessor."""
        unique = []
        previous = ""
        for line in lines:
            if line != previous:
                unique.append(line)
            previous = line
        return unique

    def extract_text_tesseract(self, image_path):
        """Extract the text lines of an image file using Tesseract.

        Args:
            image_path: Path of the image file to read.

        Returns:
            The recognized lines in top-to-bottom order with adjacent
            duplicate lines collapsed, or an empty list when the file does
            not exist or processing fails.
        """
        if not os.path.exists(image_path):
            print(f"错误: 图像文件不存在 {image_path}")
            return []

        try:
            # The context manager closes the underlying file handle even if
            # splitting or OCR raises.
            with Image.open(image_path) as source:
                if source.mode != 'RGB':
                    image = source.convert('RGB')
                else:
                    image = source

                chunks = self.split_image_for_tesseract(image)

                all_texts = []
                # Pop chunks as they are consumed so that no list entry keeps
                # an already-processed crop alive.
                while chunks:
                    chunk, chunk_num, _start_y, _end_y = chunks.pop(0)
                    all_texts.extend(
                        self.ocr_chunk_with_tesseract(chunk, chunk_num)
                    )
                    del chunk
                    gc.collect()

            # Overlapping bands can yield the same line twice in a row.
            return self._drop_adjacent_duplicates(all_texts)
        except Exception as e:
            print(f"OCR处理失败: {e}")
            return []

    def extract_text(self, image_path):
        """Compatibility alias for :meth:`extract_text_tesseract`."""
        return self.extract_text_tesseract(image_path)

    def save_text(self, texts, output_path):
        """Write the recognized lines to a UTF-8 text file, one per line.

        Args:
            texts: Iterable of strings to write.
            output_path: Destination file path; an existing file is
                overwritten.

        Returns:
            ``True`` on success, ``False`` if writing failed.
        """
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.writelines(text + '\n' for text in texts)
            print(f"文字已保存到: {output_path}")
            return True
        except Exception as e:
            print(f"保存文字失败: {e}")
            return False