"""Text extraction module for frames and diagrams."""

import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import cv2
import numpy as np

logger = logging.getLogger(__name__)
class TextExtractor:
    """Extract text from images, frames, and diagrams.

    Text is located with MSER region detection and read either with a local
    Tesseract install (via pytesseract) or, eventually, a remote OCR API
    (not yet implemented — those code paths return a placeholder string).
    """

    def __init__(self, tesseract_path: Optional[str] = None):
        """
        Initialize text extractor.

        Parameters
        ----------
        tesseract_path : str, optional
            Path to tesseract executable for local OCR. Local OCR is enabled
            only when this is given and pytesseract is importable.
        """
        self.tesseract_path = tesseract_path

        # Local OCR is opt-in: it needs both an explicit tesseract path and
        # an importable pytesseract package; otherwise fall through with a
        # warning rather than failing construction.
        self.use_local_ocr = False
        if tesseract_path:
            try:
                import pytesseract

                pytesseract.pytesseract.tesseract_cmd = tesseract_path
                self.use_local_ocr = True
            except ImportError:
                logger.warning("pytesseract not installed, local OCR unavailable")

    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """
        Preprocess image for better text extraction.

        Parameters
        ----------
        image : np.ndarray
            Input image (3-channel BGR or single-channel grayscale).

        Returns
        -------
        np.ndarray
            Binarized, denoised grayscale image with dark text on a light
            background.
        """
        # Convert to grayscale if not already
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        # Adaptive thresholding copes with uneven illumination better than a
        # global threshold; BINARY_INV makes text the white foreground so the
        # morphological opening below acts on the glyphs.
        thresh = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2
        )

        # Morphological opening with a minimal kernel removes speckle noise.
        kernel = np.ones((1, 1), np.uint8)
        opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)

        # Invert back to dark-text-on-light, the polarity OCR expects.
        result = cv2.bitwise_not(opening)

        return result

    def extract_text_local(self, image: np.ndarray) -> str:
        """
        Extract text from image using local OCR (Tesseract).

        Parameters
        ----------
        image : np.ndarray
            Input image

        Returns
        -------
        str
            Extracted text

        Raises
        ------
        RuntimeError
            If local OCR was not configured at construction time.
        """
        if not self.use_local_ocr:
            raise RuntimeError("Local OCR not configured")

        import pytesseract

        # Binarize/denoise first, then OCR the cleaned image.
        processed = self.preprocess_image(image)
        text = pytesseract.image_to_string(processed)

        return text

    def detect_text_regions(self, image: np.ndarray) -> List[Tuple[int, int, int, int]]:
        """
        Detect potential text regions in image.

        Parameters
        ----------
        image : np.ndarray
            Input image

        Returns
        -------
        list
            List of bounding boxes for text regions (x, y, w, h)
        """
        # Convert to grayscale
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        # MSER (Maximally Stable Extremal Regions) is a classic detector for
        # text-like blobs.
        mser = cv2.MSER_create()
        regions, _ = mser.detectRegions(gray)

        # Convert point sets to bounding boxes, keeping only roughly
        # text-shaped candidates: not extremely elongated in either
        # direction and larger than a few pixels.
        bboxes = []
        for region in regions:
            x, y, w, h = cv2.boundingRect(region.reshape(-1, 1, 2))

            aspect_ratio = w / float(h)
            if 0.1 < aspect_ratio < 10 and h > 5 and w > 5:
                bboxes.append((x, y, w, h))

        # Collapse overlapping candidates into single boxes.
        merged_bboxes = self._merge_overlapping_boxes(bboxes)

        logger.debug(f"Detected {len(merged_bboxes)} text regions")
        return merged_bboxes

    def _merge_overlapping_boxes(
        self, boxes: List[Tuple[int, int, int, int]]
    ) -> List[Tuple[int, int, int, int]]:
        """
        Merge overlapping bounding boxes.

        Repeatedly unions any pair of boxes that overlap (or touch) until no
        overlapping pair remains, so chains of transitively-overlapping boxes
        collapse into one.

        Parameters
        ----------
        boxes : list
            List of bounding boxes (x, y, w, h)

        Returns
        -------
        list
            Merged, mutually non-overlapping bounding boxes (x, y, w, h)
        """
        if not boxes:
            return []

        # Fix: a single sorted sweep compares only consecutive boxes in x
        # order and misses overlaps between non-adjacent ones (e.g. A and C
        # overlapping while an unrelated B sits between them in x). Iterate
        # pairwise unions to a fixpoint instead; each merge removes one box,
        # so the loop terminates.
        merged = [tuple(b) for b in boxes]
        changed = True
        while changed:
            changed = False
            result: List[Tuple[int, int, int, int]] = []
            for box in merged:
                for i, other in enumerate(result):
                    if self._boxes_overlap(other, box):
                        result[i] = self._union_box(other, box)
                        changed = True
                        break
                else:
                    result.append(box)
            merged = result

        return merged

    @staticmethod
    def _boxes_overlap(
        a: Tuple[int, int, int, int], b: Tuple[int, int, int, int]
    ) -> bool:
        """Return True if (x, y, w, h) boxes a and b overlap or touch."""
        return (
            a[0] <= b[0] + b[2]
            and b[0] <= a[0] + a[2]
            and a[1] <= b[1] + b[3]
            and b[1] <= a[1] + a[3]
        )

    @staticmethod
    def _union_box(
        a: Tuple[int, int, int, int], b: Tuple[int, int, int, int]
    ) -> Tuple[int, int, int, int]:
        """Return the smallest (x, y, w, h) box covering both a and b."""
        x1 = min(a[0], b[0])
        y1 = min(a[1], b[1])
        x2 = max(a[0] + a[2], b[0] + b[2])
        y2 = max(a[1] + a[3], b[1] + b[3])
        return (x1, y1, x2 - x1, y2 - y1)

    def extract_text_from_regions(
        self, image: np.ndarray, regions: List[Tuple[int, int, int, int]]
    ) -> Dict[Tuple[int, int, int, int], str]:
        """
        Extract text from specified regions in image.

        Parameters
        ----------
        image : np.ndarray
            Input image
        regions : list
            List of regions as (x, y, w, h)

        Returns
        -------
        dict
            Dictionary of {region: text}; regions yielding no text are
            omitted.
        """
        results: Dict[Tuple[int, int, int, int], str] = {}

        for region in regions:
            x, y, w, h = region

            # Crop the region of interest; numpy slicing clips to the image
            # bounds, so degenerate crops show up as empty arrays.
            roi = image[y : y + h, x : x + w]
            if roi.size == 0:
                continue

            if self.use_local_ocr:
                text = self.extract_text_local(roi)
            else:
                # TODO: wire up a remote OCR backend here.
                text = "API-based text extraction not yet implemented"

            # Store non-empty results only.
            if text.strip():
                results[region] = text.strip()

        return results

    def extract_text_from_image(self, image: np.ndarray, detect_regions: bool = True) -> str:
        """
        Extract text from entire image.

        Parameters
        ----------
        image : np.ndarray
            Input image
        detect_regions : bool
            Whether to detect and process text regions separately

        Returns
        -------
        str
            Extracted text
        """
        if detect_regions:
            # Detect candidate regions and OCR each one, joining the results.
            regions = self.detect_text_regions(image)
            region_texts = self.extract_text_from_regions(image, regions)
            text = "\n".join(region_texts.values())
        else:
            # OCR the whole image in one pass.
            if self.use_local_ocr:
                text = self.extract_text_local(image)
            else:
                text = "API-based text extraction not yet implemented"

        return text

    def extract_text_from_file(
        self, image_path: Union[str, Path], detect_regions: bool = True
    ) -> str:
        """
        Extract text from image file.

        Parameters
        ----------
        image_path : str or Path
            Path to image file
        detect_regions : bool
            Whether to detect and process text regions separately

        Returns
        -------
        str
            Extracted text

        Raises
        ------
        FileNotFoundError
            If the path does not exist.
        ValueError
            If the file exists but cannot be decoded as an image.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # cv2.imread returns None (rather than raising) on decode failure.
        image = cv2.imread(str(image_path))
        if image is None:
            raise ValueError(f"Failed to load image: {image_path}")

        text = self.extract_text_from_image(image, detect_regions)

        return text