"""Text extraction module for frames and diagrams."""

import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import cv2
import numpy as np

logger = logging.getLogger(__name__)
class TextExtractor:
    """Extract text from images, frames, and diagrams.

    Text is located with MSER region detection and read either with a local
    Tesseract install (via pytesseract) or, eventually, a remote OCR API
    (not yet implemented — those code paths return a placeholder string).
    """

    def __init__(self, tesseract_path: Optional[str] = None):
        """
        Initialize text extractor.

        Parameters
        ----------
        tesseract_path : str, optional
            Path to tesseract executable for local OCR. Local OCR is enabled
            only when this is given and pytesseract is importable.
        """
        self.tesseract_path = tesseract_path

        # Local OCR is opt-in: it needs both an explicit tesseract path and
        # an importable pytesseract package; otherwise fall through with a
        # warning rather than failing construction.
        self.use_local_ocr = False
        if tesseract_path:
            try:
                import pytesseract

                pytesseract.pytesseract.tesseract_cmd = tesseract_path
                self.use_local_ocr = True
            except ImportError:
                logger.warning("pytesseract not installed, local OCR unavailable")

    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """
        Preprocess image for better text extraction.

        Parameters
        ----------
        image : np.ndarray
            Input image (3-channel BGR or single-channel grayscale).

        Returns
        -------
        np.ndarray
            Binarized, denoised grayscale image with dark text on a light
            background.
        """
        # Convert to grayscale if not already
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        # Adaptive thresholding copes with uneven illumination better than a
        # global threshold; BINARY_INV makes text the white foreground so the
        # morphological opening below acts on the glyphs.
        thresh = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2
        )

        # Morphological opening with a minimal kernel removes speckle noise.
        kernel = np.ones((1, 1), np.uint8)
        opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)

        # Invert back to dark-text-on-light, the polarity OCR expects.
        result = cv2.bitwise_not(opening)

        return result

    def extract_text_local(self, image: np.ndarray) -> str:
        """
        Extract text from image using local OCR (Tesseract).

        Parameters
        ----------
        image : np.ndarray
            Input image

        Returns
        -------
        str
            Extracted text

        Raises
        ------
        RuntimeError
            If local OCR was not configured at construction time.
        """
        if not self.use_local_ocr:
            raise RuntimeError("Local OCR not configured")

        import pytesseract

        # Binarize/denoise first, then OCR the cleaned image.
        processed = self.preprocess_image(image)
        text = pytesseract.image_to_string(processed)

        return text

    def detect_text_regions(self, image: np.ndarray) -> List[Tuple[int, int, int, int]]:
        """
        Detect potential text regions in image.

        Parameters
        ----------
        image : np.ndarray
            Input image

        Returns
        -------
        list
            List of bounding boxes for text regions (x, y, w, h)
        """
        # Convert to grayscale
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        # MSER (Maximally Stable Extremal Regions) is a classic detector for
        # text-like blobs.
        mser = cv2.MSER_create()
        regions, _ = mser.detectRegions(gray)

        # Convert point sets to bounding boxes, keeping only roughly
        # text-shaped candidates: not extremely elongated in either
        # direction and larger than a few pixels.
        bboxes = []
        for region in regions:
            x, y, w, h = cv2.boundingRect(region.reshape(-1, 1, 2))

            aspect_ratio = w / float(h)
            if 0.1 < aspect_ratio < 10 and h > 5 and w > 5:
                bboxes.append((x, y, w, h))

        # Collapse overlapping candidates into single boxes.
        merged_bboxes = self._merge_overlapping_boxes(bboxes)

        logger.debug(f"Detected {len(merged_bboxes)} text regions")
        return merged_bboxes

    def _merge_overlapping_boxes(
        self, boxes: List[Tuple[int, int, int, int]]
    ) -> List[Tuple[int, int, int, int]]:
        """
        Merge overlapping bounding boxes.

        Repeatedly unions any pair of boxes that overlap (or touch) until no
        overlapping pair remains, so chains of transitively-overlapping boxes
        collapse into one.

        Parameters
        ----------
        boxes : list
            List of bounding boxes (x, y, w, h)

        Returns
        -------
        list
            Merged, mutually non-overlapping bounding boxes (x, y, w, h)
        """
        if not boxes:
            return []

        # Fix: a single sorted sweep compares only consecutive boxes in x
        # order and misses overlaps between non-adjacent ones (e.g. A and C
        # overlapping while an unrelated B sits between them in x). Iterate
        # pairwise unions to a fixpoint instead; each merge removes one box,
        # so the loop terminates.
        merged = [tuple(b) for b in boxes]
        changed = True
        while changed:
            changed = False
            result: List[Tuple[int, int, int, int]] = []
            for box in merged:
                for i, other in enumerate(result):
                    if self._boxes_overlap(other, box):
                        result[i] = self._union_box(other, box)
                        changed = True
                        break
                else:
                    result.append(box)
            merged = result

        return merged

    @staticmethod
    def _boxes_overlap(
        a: Tuple[int, int, int, int], b: Tuple[int, int, int, int]
    ) -> bool:
        """Return True if (x, y, w, h) boxes a and b overlap or touch."""
        return (
            a[0] <= b[0] + b[2]
            and b[0] <= a[0] + a[2]
            and a[1] <= b[1] + b[3]
            and b[1] <= a[1] + a[3]
        )

    @staticmethod
    def _union_box(
        a: Tuple[int, int, int, int], b: Tuple[int, int, int, int]
    ) -> Tuple[int, int, int, int]:
        """Return the smallest (x, y, w, h) box covering both a and b."""
        x1 = min(a[0], b[0])
        y1 = min(a[1], b[1])
        x2 = max(a[0] + a[2], b[0] + b[2])
        y2 = max(a[1] + a[3], b[1] + b[3])
        return (x1, y1, x2 - x1, y2 - y1)

    def extract_text_from_regions(
        self, image: np.ndarray, regions: List[Tuple[int, int, int, int]]
    ) -> Dict[Tuple[int, int, int, int], str]:
        """
        Extract text from specified regions in image.

        Parameters
        ----------
        image : np.ndarray
            Input image
        regions : list
            List of regions as (x, y, w, h)

        Returns
        -------
        dict
            Dictionary of {region: text}; regions yielding no text are
            omitted.
        """
        results: Dict[Tuple[int, int, int, int], str] = {}

        for region in regions:
            x, y, w, h = region

            # Crop the region of interest; numpy slicing clips to the image
            # bounds, so degenerate crops show up as empty arrays.
            roi = image[y : y + h, x : x + w]
            if roi.size == 0:
                continue

            if self.use_local_ocr:
                text = self.extract_text_local(roi)
            else:
                # TODO: wire up a remote OCR backend here.
                text = "API-based text extraction not yet implemented"

            # Store non-empty results only.
            if text.strip():
                results[region] = text.strip()

        return results

    def extract_text_from_image(self, image: np.ndarray, detect_regions: bool = True) -> str:
        """
        Extract text from entire image.

        Parameters
        ----------
        image : np.ndarray
            Input image
        detect_regions : bool
            Whether to detect and process text regions separately

        Returns
        -------
        str
            Extracted text
        """
        if detect_regions:
            # Detect candidate regions and OCR each one, joining the results.
            regions = self.detect_text_regions(image)
            region_texts = self.extract_text_from_regions(image, regions)
            text = "\n".join(region_texts.values())
        else:
            # OCR the whole image in one pass.
            if self.use_local_ocr:
                text = self.extract_text_local(image)
            else:
                text = "API-based text extraction not yet implemented"

        return text

    def extract_text_from_file(
        self, image_path: Union[str, Path], detect_regions: bool = True
    ) -> str:
        """
        Extract text from image file.

        Parameters
        ----------
        image_path : str or Path
            Path to image file
        detect_regions : bool
            Whether to detect and process text regions separately

        Returns
        -------
        str
            Extracted text

        Raises
        ------
        FileNotFoundError
            If the path does not exist.
        ValueError
            If the file exists but cannot be decoded as an image.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # cv2.imread returns None (rather than raising) on decode failure.
        image = cv2.imread(str(image_path))
        if image is None:
            raise ValueError(f"Failed to load image: {image_path}")

        text = self.extract_text_from_image(image, detect_regions)

        return text