|
0981a08…
|
noreply
|
1 |
"""Taxonomy classifier for planning entity extraction. |
|
0981a08…
|
noreply
|
2 |
|
|
0981a08…
|
noreply
|
3 |
Bridges raw knowledge graph entities (person, technology, concept) into |
|
0981a08…
|
noreply
|
4 |
planning-ready structures (goals, requirements, decisions, risks). |
|
0981a08…
|
noreply
|
5 |
""" |
|
0981a08…
|
noreply
|
6 |
|
|
0981a08…
|
noreply
|
7 |
import logging |
|
0981a08…
|
noreply
|
8 |
from typing import Any, Dict, List, Optional |
|
0981a08…
|
noreply
|
9 |
|
|
0981a08…
|
noreply
|
10 |
from video_processor.models import PlanningEntity, PlanningEntityType |
|
0981a08…
|
noreply
|
11 |
|
|
0981a08…
|
noreply
|
12 |
logger = logging.getLogger(__name__)

# Keyword rules for heuristic classification. Each tuple is
# (PlanningEntityType, list-of-keywords). Order matters — first match wins.
#
# Because the first matching rule wins, a rule whose keyword contains another
# rule's keyword must come first. CONSTRAINT ("must not") is therefore listed
# before REQUIREMENT ("must"); in the opposite order every "must not ..."
# description is claimed by REQUIREMENT and the CONSTRAINT keyword is dead.
_KEYWORD_RULES: List[tuple] = [
    (PlanningEntityType.GOAL, ["goal", "objective", "aim", "target outcome"]),
    (
        PlanningEntityType.CONSTRAINT,
        ["constraint", "limitation", "restrict", "cannot", "must not"],
    ),
    (
        PlanningEntityType.REQUIREMENT,
        ["must", "should", "requirement", "need", "required"],
    ),
    (
        PlanningEntityType.DECISION,
        ["decided", "decision", "chose", "selected", "agreed"],
    ),
    (PlanningEntityType.RISK, ["risk", "concern", "worry", "danger", "threat"]),
    (
        PlanningEntityType.ASSUMPTION,
        ["assume", "assumption", "expecting", "presume"],
    ),
    (
        PlanningEntityType.DEPENDENCY,
        ["depends", "dependency", "relies on", "prerequisite", "blocked"],
    ),
    (
        PlanningEntityType.MILESTONE,
        ["milestone", "deadline", "deliverable", "release", "launch"],
    ),
    (
        PlanningEntityType.TASK,
        ["task", "todo", "action item", "work item", "implement"],
    ),
    (PlanningEntityType.FEATURE, ["feature", "capability", "functionality"]),
]
|
0981a08…
|
noreply
|
49 |
|
|
0981a08…
|
noreply
|
50 |
|
|
0981a08…
|
noreply
|
51 |
class TaxonomyClassifier:
    """Classifies raw knowledge graph entities into planning taxonomy types.

    Classification runs in two stages: a keyword heuristic over each entity's
    descriptions, optionally followed by an LLM pass (when a provider manager
    was supplied) whose verdicts override the heuristic ones.
    """

    def __init__(self, provider_manager: Optional[Any] = None):
        """Store the optional provider manager.

        Args:
            provider_manager: Object whose ``chat(messages, max_tokens=...,
                temperature=...)`` method is used for LLM refinement. When it
                is falsy, only the heuristic stage runs.
        """
        self.pm = provider_manager

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def classify_entities(
        self,
        entities: List[Dict],
        relationships: List[Dict],
    ) -> List["PlanningEntity"]:
        """Classify extracted entities into planning entity types.

        Uses heuristic classification first, then LLM refinement if a
        provider manager is available.

        Args:
            entities: Entity dicts. Each must have a ``"name"`` key and may
                have ``"descriptions"`` (list of strings) and ``"type"``.
            relationships: Relationship dicts, forwarded to both stages.

        Returns:
            The classified entities. Entities matching no planning category
            are omitted.
        """
        planning_entities: List["PlanningEntity"] = []

        # Step 1: heuristic classification
        for entity in entities:
            planning_type = self._heuristic_classify(entity, relationships)
            if not planning_type:
                continue
            planning_entities.append(
                PlanningEntity(
                    name=entity["name"],
                    planning_type=planning_type,
                    description=self._summarize_descriptions(entity),
                    source_entities=[entity["name"]],
                )
            )

        # Step 2: LLM refinement (if provider available)
        if self.pm and entities:
            llm_classified = self._llm_classify(entities, relationships)
            planning_entities = self._merge_classifications(planning_entities, llm_classified)

        return planning_entities

    def organize_by_workstream(
        self, planning_entities: List["PlanningEntity"]
    ) -> Dict[str, List["PlanningEntity"]]:
        """Group planning entities into logical workstreams by type.

        Args:
            planning_entities: Entities whose ``planning_type.value`` names
                their category.

        Returns:
            Mapping from the pluralized category name (e.g. ``"goals"``,
            ``"dependencies"``) to the entities of that category, in input
            order.
        """
        workstreams: Dict[str, List["PlanningEntity"]] = {}
        for pe in planning_entities:
            group = self._pluralize(pe.planning_type.value)
            workstreams.setdefault(group, []).append(pe)
        return workstreams

    @staticmethod
    def _pluralize(noun: str) -> str:
        """Return the English plural used as a workstream key.

        Simply appending "s" turns "dependency" into "dependencys", so a
        trailing consonant + "y" is rewritten to "ies".
        """
        if len(noun) > 1 and noun.endswith("y") and noun[-2] not in "aeiou":
            return noun[:-1] + "ies"
        return noun + "s"

    @staticmethod
    def _summarize_descriptions(entity: Dict, limit: int = 2) -> str:
        """Join the first ``limit`` descriptions of ``entity`` with "; ".

        A missing or ``None`` ``"descriptions"`` value yields an empty string.
        """
        descriptions = entity.get("descriptions") or []
        return "; ".join(descriptions[:limit])

    # ------------------------------------------------------------------
    # Heuristic classification
    # ------------------------------------------------------------------

    @staticmethod
    def _contains_keyword(text: str, keyword: str) -> bool:
        """Return True if ``keyword`` occurs in ``text`` at the start of a word.

        Only the left edge is anchored, so stem-like keywords still match
        their inflections ("restrict" matches "restrictions") while
        occurrences buried inside another word ("aim" in "claim", "risk" in
        "asterisk") are ignored.
        """
        start = text.find(keyword)
        while start != -1:
            if start == 0 or not text[start - 1].isalnum():
                return True
            start = text.find(keyword, start + 1)
        return False

    def _heuristic_classify(
        self,
        entity: Dict,
        relationships: List[Dict],  # noqa: ARG002 — reserved for future rules
    ) -> Optional["PlanningEntityType"]:
        """Rule-based classification from description keywords.

        Args:
            entity: Entity dict; only ``"descriptions"`` is consulted.
            relationships: Currently unused.

        Returns:
            The type of the first rule in ``_KEYWORD_RULES`` with a keyword
            present in the lower-cased descriptions, or ``None`` if no rule
            matches.
        """
        # ``or []`` also covers an explicit ``None`` value, which the join
        # below would otherwise reject.
        desc_lower = " ".join(entity.get("descriptions") or []).lower()
        if not desc_lower:
            return None

        for planning_type, keywords in _KEYWORD_RULES:
            if any(self._contains_keyword(desc_lower, kw) for kw in keywords):
                return planning_type

        return None

    # ------------------------------------------------------------------
    # LLM classification
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_planning_type(raw: str) -> Optional["PlanningEntityType"]:
        """Map an LLM-supplied category string to a ``PlanningEntityType``.

        The exact string is tried first; if that fails, a stripped,
        lower-cased variant is tried so answers such as ``" Goal "`` are not
        discarded. Returns ``None`` when neither form is a valid value.
        """
        for candidate in (raw, raw.strip().lower()):
            try:
                return PlanningEntityType(candidate)
            except ValueError:
                continue
        return None

    def _llm_classify(
        self, entities: List[Dict], relationships: List[Dict]
    ) -> List["PlanningEntity"]:
        """Use LLM to classify entities into planning types.

        Args:
            entities: Entity dicts; only the first 50 are sent to the model.
            relationships: Currently unused by this method.

        Returns:
            One planning entity per well-formed item in the model's JSON
            reply. Returns an empty list if the chat call raises or the reply
            does not parse to a list.
        """
        entity_summaries = []
        # Descriptions of the entities sent to the model, keyed by lower-cased
        # name, so they can be attached to the model's answers. Without this,
        # an LLM entity that overrides a heuristic one in the merge step would
        # drop the description.
        descriptions_by_name: Dict[str, str] = {}
        for e in entities[:50]:  # limit to avoid token overflow
            summary = self._summarize_descriptions(e)
            descriptions_by_name[e["name"].lower()] = summary
            desc_str = summary or "no description"
            entity_summaries.append(f"- {e['name']} ({e.get('type', 'concept')}): {desc_str}")

        prompt = (
            "Classify these entities from a knowledge graph into planning categories.\n\n"
            "Entities:\n" + "\n".join(entity_summaries) + "\n\n"
            "Categories: goal, requirement, constraint, decision, risk, assumption, "
            "dependency, milestone, task, feature\n\n"
            "For each entity that fits a planning category, return JSON:\n"
            '[{"name": "...", "planning_type": "...", "priority": "high|medium|low"}]\n\n'
            "Only include entities that clearly fit a planning category. "
            "Skip entities that are just people, technologies, or general concepts. "
            "Return ONLY the JSON array."
        )

        try:
            raw = self.pm.chat(
                [{"role": "user", "content": prompt}],
                max_tokens=2048,
                temperature=0.2,
            )
        except Exception:
            # Best effort: the heuristic result is still usable, but keep the
            # traceback so provider failures can be diagnosed.
            logger.warning("LLM classification failed, using heuristic only", exc_info=True)
            return []

        # Imported at call time, as in the original code.
        from video_processor.utils.json_parsing import parse_json_from_response

        parsed = parse_json_from_response(raw)
        if not isinstance(parsed, list):
            return []

        results: List["PlanningEntity"] = []
        for item in parsed:
            if not isinstance(item, dict):
                continue
            name = item.get("name")
            raw_type = item.get("planning_type")
            if not isinstance(name, str) or not name or not isinstance(raw_type, str):
                continue

            ptype = self._parse_planning_type(raw_type)
            if ptype is None:
                logger.debug("Skipping %r: unknown planning type %r", name, raw_type)
                continue

            fields: Dict[str, Any] = {
                "name": name,
                "planning_type": ptype,
                "priority": item.get("priority"),
                "source_entities": [name],
            }
            description = descriptions_by_name.get(name.lower())
            if description:
                fields["description"] = description

            try:
                results.append(PlanningEntity(**fields))
            except ValueError:
                # The original code also skipped items whose construction
                # raised ValueError; log instead of passing silently.
                logger.debug("Skipping %r: could not build planning entity", name)
        return results

    # ------------------------------------------------------------------
    # Merge
    # ------------------------------------------------------------------

    @staticmethod
    def _merge_classifications(
        heuristic: List["PlanningEntity"],
        llm: List["PlanningEntity"],
    ) -> List["PlanningEntity"]:
        """Merge heuristic and LLM classifications. LLM wins on conflicts.

        Entities are matched by case-insensitive name. An overridden entry
        keeps the position of the heuristic entry it replaces; names seen
        only by the LLM are appended in LLM order.
        """
        by_name = {pe.name.lower(): pe for pe in heuristic}
        by_name.update((pe.name.lower(), pe) for pe in llm)  # LLM overrides
        return list(by_name.values())