PlanOpticon

planopticon / video_processor / extractors / text_extractor.py
Blame History Raw 295 lines
1
"""Text extraction module for frames and diagrams."""
2
3
import logging
4
from pathlib import Path
5
from typing import Dict, List, Optional, Tuple, Union
6
7
import cv2
8
import numpy as np
9
10
logger = logging.getLogger(__name__)
11
12
13
class TextExtractor:
    """Extract text from images, frames, and diagrams."""

    def __init__(self, tesseract_path: Optional[str] = None):
        """
        Initialize text extractor.

        Parameters
        ----------
        tesseract_path : str, optional
            Path to tesseract executable for local OCR
        """
        self.tesseract_path = tesseract_path
        # Local OCR stays disabled unless a tesseract path is supplied
        # AND pytesseract can actually be imported.
        self.use_local_ocr = False
        if not tesseract_path:
            return
        try:
            import pytesseract
        except ImportError:
            logger.warning("pytesseract not installed, local OCR unavailable")
        else:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
            self.use_local_ocr = True
37
38
def preprocess_image(self, image: np.ndarray) -> np.ndarray:
    """
    Preprocess image for better text extraction.

    Converts to grayscale, binarizes with an adaptive threshold,
    removes speckle noise with a morphological opening, and inverts
    back to dark-text-on-light polarity.

    Parameters
    ----------
    image : np.ndarray
        Input image (BGR color or single-channel grayscale)

    Returns
    -------
    np.ndarray
        Preprocessed single-channel image
    """
    # Collapse a 3-channel BGR frame to grayscale; pass grayscale through.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if image.ndim == 3 else image

    # Gaussian adaptive threshold (inverted) copes with uneven lighting.
    binarized = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2
    )

    # Morphological opening with a 1x1 kernel to suppress isolated noise.
    cleaned = cv2.morphologyEx(binarized, cv2.MORPH_OPEN, np.ones((1, 1), np.uint8))

    # Undo the earlier inversion before handing the image to OCR.
    return cv2.bitwise_not(cleaned)
71
72
def extract_text_local(self, image: np.ndarray) -> str:
73
"""
74
Extract text from image using local OCR (Tesseract).
75
76
Parameters
77
----------
78
image : np.ndarray
79
Input image
80
81
Returns
82
-------
83
str
84
Extracted text
85
"""
86
if not self.use_local_ocr:
87
raise RuntimeError("Local OCR not configured")
88
89
import pytesseract
90
91
# Preprocess image
92
processed = self.preprocess_image(image)
93
94
# Extract text
95
text = pytesseract.image_to_string(processed)
96
97
return text
98
99
def detect_text_regions(self, image: np.ndarray) -> List[Tuple[int, int, int, int]]:
    """
    Detect potential text regions in image.

    Parameters
    ----------
    image : np.ndarray
        Input image

    Returns
    -------
    list
        List of bounding boxes for text regions (x, y, w, h)
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if image.ndim == 3 else image

    # MSER (Maximally Stable Extremal Regions) finds blob-like areas
    # that often correspond to characters or words.
    detector = cv2.MSER_create()
    regions, _ = detector.detectRegions(gray)

    # Keep only regions whose bounding box looks text-like:
    # a moderate aspect ratio and a minimum pixel size.
    candidates = []
    for pts in regions:
        x, y, w, h = cv2.boundingRect(pts.reshape(-1, 1, 2))
        ratio = w / float(h)
        if h > 5 and w > 5 and 0.1 < ratio < 10:
            candidates.append((x, y, w, h))

    # Collapse overlapping candidates into single boxes.
    merged = self._merge_overlapping_boxes(candidates)

    logger.debug(f"Detected {len(merged)} text regions")
    return merged
138
139
def _merge_overlapping_boxes(
140
self, boxes: List[Tuple[int, int, int, int]]
141
) -> List[Tuple[int, int, int, int]]:
142
"""
143
Merge overlapping bounding boxes.
144
145
Parameters
146
----------
147
boxes : list
148
List of bounding boxes (x, y, w, h)
149
150
Returns
151
-------
152
list
153
Merged bounding boxes
154
"""
155
if not boxes:
156
return []
157
158
# Sort boxes by x coordinate
159
sorted_boxes = sorted(boxes, key=lambda b: b[0])
160
161
merged = []
162
current = list(sorted_boxes[0])
163
164
for box in sorted_boxes[1:]:
165
# Check if current box overlaps with the next one
166
if (
167
current[0] <= box[0] + box[2]
168
and box[0] <= current[0] + current[2]
169
and current[1] <= box[1] + box[3]
170
and box[1] <= current[1] + current[3]
171
):
172
# Calculate merged box
173
x1 = min(current[0], box[0])
174
y1 = min(current[1], box[1])
175
x2 = max(current[0] + current[2], box[0] + box[2])
176
y2 = max(current[1] + current[3], box[1] + box[3])
177
178
# Update current box
179
current = [x1, y1, x2 - x1, y2 - y1]
180
else:
181
# Add current box to merged list and update current
182
merged.append(tuple(current))
183
current = list(box)
184
185
# Add the last box
186
merged.append(tuple(current))
187
188
return merged
189
190
def extract_text_from_regions(
191
self, image: np.ndarray, regions: List[Tuple[int, int, int, int]]
192
) -> Dict[Tuple[int, int, int, int], str]:
193
"""
194
Extract text from specified regions in image.
195
196
Parameters
197
----------
198
image : np.ndarray
199
Input image
200
regions : list
201
List of regions as (x, y, w, h)
202
203
Returns
204
-------
205
dict
206
Dictionary of {region: text}
207
"""
208
results = {}
209
210
for region in regions:
211
x, y, w, h = region
212
213
# Extract region
214
roi = image[y : y + h, x : x + w]
215
216
# Skip empty regions
217
if roi.size == 0:
218
continue
219
220
# Extract text
221
if self.use_local_ocr:
222
text = self.extract_text_local(roi)
223
else:
224
text = "API-based text extraction not yet implemented"
225
226
# Store non-empty results
227
if text.strip():
228
results[region] = text.strip()
229
230
return results
231
232
def extract_text_from_image(self, image: np.ndarray, detect_regions: bool = True) -> str:
233
"""
234
Extract text from entire image.
235
236
Parameters
237
----------
238
image : np.ndarray
239
Input image
240
detect_regions : bool
241
Whether to detect and process text regions separately
242
243
Returns
244
-------
245
str
246
Extracted text
247
"""
248
if detect_regions:
249
# Detect regions and extract text from each
250
regions = self.detect_text_regions(image)
251
region_texts = self.extract_text_from_regions(image, regions)
252
253
# Combine text from all regions
254
text = "\n".join(region_texts.values())
255
else:
256
# Extract text from entire image
257
if self.use_local_ocr:
258
text = self.extract_text_local(image)
259
else:
260
text = "API-based text extraction not yet implemented"
261
262
return text
263
264
def extract_text_from_file(
265
self, image_path: Union[str, Path], detect_regions: bool = True
266
) -> str:
267
"""
268
Extract text from image file.
269
270
Parameters
271
----------
272
image_path : str or Path
273
Path to image file
274
detect_regions : bool
275
Whether to detect and process text regions separately
276
277
Returns
278
-------
279
str
280
Extracted text
281
"""
282
image_path = Path(image_path)
283
if not image_path.exists():
284
raise FileNotFoundError(f"Image file not found: {image_path}")
285
286
# Load image
287
image = cv2.imread(str(image_path))
288
if image is None:
289
raise ValueError(f"Failed to load image: {image_path}")
290
291
# Extract text
292
text = self.extract_text_from_image(image, detect_regions)
293
294
return text
295

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button