PlanOpticon

planopticon / video_processor / extractors / text_extractor.py
Source Blame History 294 lines
287a3bb… leo 1 """Text extraction module for frames and diagrams."""
829e24a… leo 2
287a3bb… leo 3 import logging
287a3bb… leo 4 from pathlib import Path
287a3bb… leo 5 from typing import Dict, List, Optional, Tuple, Union
287a3bb… leo 6
287a3bb… leo 7 import cv2
287a3bb… leo 8 import numpy as np
287a3bb… leo 9
287a3bb… leo 10 logger = logging.getLogger(__name__)
287a3bb… leo 11
829e24a… leo 12
class TextExtractor:
    """Extract text from images, frames, and diagrams.

    OCR runs locally with Tesseract (through pytesseract) when a tesseract
    executable path is supplied and pytesseract is importable; otherwise
    extraction returns a fixed placeholder string until an API-based
    backend is implemented.
    """

    def __init__(self, tesseract_path: Optional[str] = None):
        """
        Initialize text extractor.

        Parameters
        ----------
        tesseract_path : str, optional
            Path to tesseract executable for local OCR. If omitted, or if
            pytesseract is not installed, local OCR stays disabled.
        """
        self.tesseract_path = tesseract_path

        # Local OCR is enabled only when a path was given AND pytesseract
        # can actually be imported.
        self.use_local_ocr = False
        if tesseract_path:
            try:
                import pytesseract

                pytesseract.pytesseract.tesseract_cmd = tesseract_path
                self.use_local_ocr = True
            except ImportError:
                logger.warning("pytesseract not installed, local OCR unavailable")

    @staticmethod
    def _to_grayscale(image: np.ndarray) -> np.ndarray:
        """Return *image* as single-channel grayscale (BGR input is converted)."""
        if len(image.shape) == 3:
            return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        return image

    def _ocr(self, image: np.ndarray) -> str:
        """
        Run OCR on *image* with the configured backend.

        Returns a fixed placeholder string when local OCR is not enabled;
        an API-based backend is not implemented yet.
        """
        if self.use_local_ocr:
            return self.extract_text_local(image)
        return "API-based text extraction not yet implemented"

    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """
        Preprocess image for better text extraction.

        Parameters
        ----------
        image : np.ndarray
            Input image (BGR or grayscale).

        Returns
        -------
        np.ndarray
            Binarized, denoised image with dark text on a light
            background, as OCR engines expect.
        """
        gray = self._to_grayscale(image)

        # Adaptive Gaussian thresholding copes with uneven illumination
        # better than a single global threshold; INV makes text white so
        # the morphological opening below acts on the glyphs.
        thresh = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2
        )

        # Opening with a 1x1 kernel removes isolated noise pixels without
        # eroding glyph strokes.
        kernel = np.ones((1, 1), np.uint8)
        opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)

        # Invert back to dark-text-on-light for the OCR engine.
        return cv2.bitwise_not(opening)

    def extract_text_local(self, image: np.ndarray) -> str:
        """
        Extract text from image using local OCR (Tesseract).

        Parameters
        ----------
        image : np.ndarray
            Input image

        Returns
        -------
        str
            Extracted text

        Raises
        ------
        RuntimeError
            If local OCR was not configured at construction time.
        """
        if not self.use_local_ocr:
            raise RuntimeError("Local OCR not configured")

        import pytesseract

        # Binarize/denoise first; raw frames OCR poorly.
        processed = self.preprocess_image(image)

        return pytesseract.image_to_string(processed)

    def detect_text_regions(self, image: np.ndarray) -> List[Tuple[int, int, int, int]]:
        """
        Detect potential text regions in image.

        Parameters
        ----------
        image : np.ndarray
            Input image

        Returns
        -------
        list
            List of bounding boxes for text regions (x, y, w, h)
        """
        gray = self._to_grayscale(image)

        # MSER (Maximally Stable Extremal Regions) responds well to the
        # high-contrast blobs that glyphs form.
        mser = cv2.MSER_create()
        regions, _ = mser.detectRegions(gray)

        bboxes = []
        for region in regions:
            x, y, w, h = cv2.boundingRect(region.reshape(-1, 1, 2))

            # Keep only plausibly text-shaped regions. Size is checked
            # first so the aspect ratio can never divide by zero.
            if h > 5 and w > 5 and 0.1 < w / float(h) < 10:
                bboxes.append((x, y, w, h))

        # Collapse overlapping detections into one box per text cluster.
        merged_bboxes = self._merge_overlapping_boxes(bboxes)

        logger.debug(f"Detected {len(merged_bboxes)} text regions")
        return merged_bboxes

    def _merge_overlapping_boxes(
        self, boxes: List[Tuple[int, int, int, int]]
    ) -> List[Tuple[int, int, int, int]]:
        """
        Merge overlapping bounding boxes.

        The x-sorted sweep is repeated until no merge occurs: a single
        pass can leave overlaps behind (boxes that overlap only
        vertically, or that overlap a box produced by an earlier merge),
        so we sweep again until the result is stable.

        Parameters
        ----------
        boxes : list
            List of bounding boxes (x, y, w, h)

        Returns
        -------
        list
            Merged, pairwise non-overlapping bounding boxes
        """
        merged = [tuple(b) for b in boxes]

        changed = True
        while changed:
            changed = False
            if len(merged) < 2:
                break

            # Sort by x so candidates for merging become adjacent.
            merged.sort(key=lambda b: b[0])

            swept = []
            current = list(merged[0])
            for box in merged[1:]:
                # Standard axis-aligned rectangle intersection test.
                if (
                    current[0] <= box[0] + box[2]
                    and box[0] <= current[0] + current[2]
                    and current[1] <= box[1] + box[3]
                    and box[1] <= current[1] + current[3]
                ):
                    # Replace both boxes with their bounding union.
                    x1 = min(current[0], box[0])
                    y1 = min(current[1], box[1])
                    x2 = max(current[0] + current[2], box[0] + box[2])
                    y2 = max(current[1] + current[3], box[1] + box[3])
                    current = [x1, y1, x2 - x1, y2 - y1]
                    changed = True
                else:
                    swept.append(tuple(current))
                    current = list(box)
            swept.append(tuple(current))
            merged = swept

        return merged

    def extract_text_from_regions(
        self, image: np.ndarray, regions: List[Tuple[int, int, int, int]]
    ) -> Dict[Tuple[int, int, int, int], str]:
        """
        Extract text from specified regions in image.

        Parameters
        ----------
        image : np.ndarray
            Input image
        regions : list
            List of regions as (x, y, w, h)

        Returns
        -------
        dict
            Dictionary of {region: text}; regions that are empty or yield
            only whitespace are omitted.
        """
        results = {}

        for region in regions:
            x, y, w, h = region

            roi = image[y : y + h, x : x + w]

            # Degenerate or out-of-bounds regions slice to nothing; skip.
            if roi.size == 0:
                continue

            text = self._ocr(roi)

            # Keep only regions that produced real text.
            if text.strip():
                results[region] = text.strip()

        return results

    def extract_text_from_image(self, image: np.ndarray, detect_regions: bool = True) -> str:
        """
        Extract text from entire image.

        Parameters
        ----------
        image : np.ndarray
            Input image
        detect_regions : bool
            Whether to detect and process text regions separately

        Returns
        -------
        str
            Extracted text
        """
        if detect_regions:
            # OCR each detected region and join the per-region results.
            regions = self.detect_text_regions(image)
            region_texts = self.extract_text_from_regions(image, regions)
            return "\n".join(region_texts.values())

        # Run OCR over the whole frame in one shot.
        return self._ocr(image)

    def extract_text_from_file(
        self, image_path: Union[str, Path], detect_regions: bool = True
    ) -> str:
        """
        Extract text from image file.

        Parameters
        ----------
        image_path : str or Path
            Path to image file
        detect_regions : bool
            Whether to detect and process text regions separately

        Returns
        -------
        str
            Extracted text

        Raises
        ------
        FileNotFoundError
            If the file does not exist.
        ValueError
            If OpenCV cannot decode the file as an image.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # cv2.imread returns None (rather than raising) on decode failure.
        image = cv2.imread(str(image_path))
        if image is None:
            raise ValueError(f"Failed to load image: {image_path}")

        return self.extract_text_from_image(image, detect_regions)

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button