"""Text extraction module for frames and diagrams."""

import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import cv2
import numpy as np

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
class TextExtractor:
    """Extract text from images, frames, and diagrams."""

    def __init__(self, tesseract_path: Optional[str] = None):
        """
        Initialize text extractor.

        Parameters
        ----------
        tesseract_path : str, optional
            Path to tesseract executable for local OCR
        """
        self.tesseract_path = tesseract_path

        # Local OCR is enabled only when a tesseract path was supplied AND
        # pytesseract is importable; otherwise we fall back to the API stub.
        self.use_local_ocr = False
        if tesseract_path:
            try:
                import pytesseract

                pytesseract.pytesseract.tesseract_cmd = tesseract_path
                self.use_local_ocr = True
            except ImportError:
                logger.warning("pytesseract not installed, local OCR unavailable")

    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """
        Preprocess image for better text extraction.

        Parameters
        ----------
        image : np.ndarray
            Input image (BGR color or single-channel grayscale)

        Returns
        -------
        np.ndarray
            Preprocessed (binarized, denoised) grayscale image
        """
        # Convert to grayscale if not already (3-dim shape implies color).
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        # Adaptive thresholding handles uneven illumination better than a
        # single global threshold; INV so text becomes white for morphology.
        thresh = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2
        )

        # Morphological opening with a 1x1 kernel removes isolated noise pixels.
        kernel = np.ones((1, 1), np.uint8)
        opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)

        # Invert back so the result has dark text on a light background,
        # which is what Tesseract expects.
        result = cv2.bitwise_not(opening)

        return result

    def extract_text_local(self, image: np.ndarray) -> str:
        """
        Extract text from image using local OCR (Tesseract).

        Parameters
        ----------
        image : np.ndarray
            Input image

        Returns
        -------
        str
            Extracted text

        Raises
        ------
        RuntimeError
            If local OCR was not configured at construction time.
        """
        if not self.use_local_ocr:
            raise RuntimeError("Local OCR not configured")

        # Import is safe here: use_local_ocr is only True when this
        # import already succeeded in __init__.
        import pytesseract

        # Binarize/denoise before OCR to improve recognition quality.
        processed = self.preprocess_image(image)

        text = pytesseract.image_to_string(processed)

        return text

    def detect_text_regions(self, image: np.ndarray) -> List[Tuple[int, int, int, int]]:
        """
        Detect potential text regions in image.

        Parameters
        ----------
        image : np.ndarray
            Input image

        Returns
        -------
        list
            List of bounding boxes for text regions (x, y, w, h)
        """
        # Convert to grayscale for MSER.
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        # MSER (Maximally Stable Extremal Regions) is a classical detector
        # for blob-like regions; text characters tend to be stable regions.
        mser = cv2.MSER_create()
        regions, _ = mser.detectRegions(gray)

        # Convert each point-set region to an axis-aligned bounding box.
        bboxes = []
        for region in regions:
            x, y, w, h = cv2.boundingRect(region.reshape(-1, 1, 2))

            # Filter to text-like regions: moderate aspect ratio and a
            # minimum size of 5x5 pixels to discard noise blobs.
            aspect_ratio = w / h
            if 0.1 < aspect_ratio < 10 and h > 5 and w > 5:
                bboxes.append((x, y, w, h))

        # Merge overlapping boxes so each text run yields one region.
        merged_bboxes = self._merge_overlapping_boxes(bboxes)

        # Lazy %-style args: the message is only formatted if DEBUG is enabled.
        logger.debug("Detected %d text regions", len(merged_bboxes))
        return merged_bboxes

    def _merge_overlapping_boxes(
        self, boxes: List[Tuple[int, int, int, int]]
    ) -> List[Tuple[int, int, int, int]]:
        """
        Merge overlapping bounding boxes.

        Single left-to-right pass over boxes sorted by x; each box is merged
        into the current accumulator while they overlap. NOTE(review): this
        is not a transitive-closure merge — chains of overlaps separated by
        a non-overlapping box in x-order may remain split.

        Parameters
        ----------
        boxes : list
            List of bounding boxes (x, y, w, h)

        Returns
        -------
        list
            Merged bounding boxes
        """
        if not boxes:
            return []

        # Sort boxes by x coordinate so overlap candidates are adjacent.
        sorted_boxes = sorted(boxes, key=lambda b: b[0])

        merged = []
        current = list(sorted_boxes[0])

        for box in sorted_boxes[1:]:
            # Standard interval-overlap test on both axes:
            # two ranges overlap iff each starts before the other ends.
            if (
                current[0] <= box[0] + box[2]
                and box[0] <= current[0] + current[2]
                and current[1] <= box[1] + box[3]
                and box[1] <= current[1] + current[3]
            ):
                # Grow the accumulator to the union of both boxes.
                x1 = min(current[0], box[0])
                y1 = min(current[1], box[1])
                x2 = max(current[0] + current[2], box[0] + box[2])
                y2 = max(current[1] + current[3], box[1] + box[3])

                current = [x1, y1, x2 - x1, y2 - y1]
            else:
                # No overlap: flush the accumulator and start a new one.
                merged.append(tuple(current))
                current = list(box)

        # Flush the final accumulator.
        merged.append(tuple(current))

        return merged

    def extract_text_from_regions(
        self, image: np.ndarray, regions: List[Tuple[int, int, int, int]]
    ) -> Dict[Tuple[int, int, int, int], str]:
        """
        Extract text from specified regions in image.

        Parameters
        ----------
        image : np.ndarray
            Input image
        regions : list
            List of regions as (x, y, w, h)

        Returns
        -------
        dict
            Dictionary of {region: text}; regions yielding only whitespace
            (or empty crops) are omitted.
        """
        results = {}

        for region in regions:
            x, y, w, h = region

            # Crop the region of interest (rows are y, columns are x).
            roi = image[y : y + h, x : x + w]

            # Skip degenerate crops (e.g. region outside the image).
            if roi.size == 0:
                continue

            if self.use_local_ocr:
                text = self.extract_text_local(roi)
            else:
                # Placeholder until remote/API OCR is implemented.
                text = "API-based text extraction not yet implemented"

            # Keep only non-blank results, stripped of surrounding whitespace.
            if text.strip():
                results[region] = text.strip()

        return results

    def extract_text_from_image(self, image: np.ndarray, detect_regions: bool = True) -> str:
        """
        Extract text from entire image.

        Parameters
        ----------
        image : np.ndarray
            Input image
        detect_regions : bool
            Whether to detect and process text regions separately

        Returns
        -------
        str
            Extracted text (region texts joined with newlines when
            detect_regions is True)
        """
        if detect_regions:
            # Per-region OCR: detect candidate regions, OCR each, then join.
            regions = self.detect_text_regions(image)
            region_texts = self.extract_text_from_regions(image, regions)

            text = "\n".join(region_texts.values())
        else:
            # Whole-image OCR in a single pass.
            if self.use_local_ocr:
                text = self.extract_text_local(image)
            else:
                # Placeholder until remote/API OCR is implemented.
                text = "API-based text extraction not yet implemented"

        return text

    def extract_text_from_file(
        self, image_path: Union[str, Path], detect_regions: bool = True
    ) -> str:
        """
        Extract text from image file.

        Parameters
        ----------
        image_path : str or Path
            Path to image file
        detect_regions : bool
            Whether to detect and process text regions separately

        Returns
        -------
        str
            Extracted text

        Raises
        ------
        FileNotFoundError
            If the file does not exist.
        ValueError
            If OpenCV cannot decode the file as an image.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # cv2.imread returns None (rather than raising) on decode failure,
        # so convert that into an explicit error.
        image = cv2.imread(str(image_path))
        if image is None:
            raise ValueError(f"Failed to load image: {image_path}")

        text = self.extract_text_from_image(image, detect_regions)

        return text