PlanOpticon
Merge pull request #47 from ConflictHQ/feat/falkordb-integration feat(graph): FalkorDB Lite integration as optional graph storage backend
Commit
0ad36b72af5ea5b93f8b49f830944c18a4d49324e909bd067878cad334abf046
Parent
d5108ce5a76dbde…
10 files changed
+2
+34
-60
+209
+2
-1
+3
-1
+355
+109
-121
+2
+2
+4
-3
~
pyproject.toml
~
tests/test_batch.py
~
tests/test_graph_store.py
~
video_processor/agent/orchestrator.py
~
video_processor/cli/commands.py
~
video_processor/integrators/graph_store.py
~
video_processor/integrators/knowledge_graph.py
~
video_processor/models.py
~
video_processor/output_structure.py
~
video_processor/pipeline.py
+2
| --- pyproject.toml | ||
| +++ pyproject.toml | ||
| @@ -54,10 +54,11 @@ | ||
| 54 | 54 | [project.optional-dependencies] |
| 55 | 55 | pdf = ["weasyprint>=60.0"] |
| 56 | 56 | gpu = ["torch>=2.0.0", "torchvision>=0.15.0"] |
| 57 | 57 | gdrive = ["google-auth>=2.0.0", "google-auth-oauthlib>=1.0.0", "google-api-python-client>=2.0.0"] |
| 58 | 58 | dropbox = ["dropbox>=12.0.0"] |
| 59 | +graph = ["falkordblite>=0.4.0"] | |
| 59 | 60 | cloud = [ |
| 60 | 61 | "planopticon[gdrive]", |
| 61 | 62 | "planopticon[dropbox]", |
| 62 | 63 | ] |
| 63 | 64 | dev = [ |
| @@ -69,10 +70,11 @@ | ||
| 69 | 70 | "ruff>=0.1.0", |
| 70 | 71 | ] |
| 71 | 72 | all = [ |
| 72 | 73 | "planopticon[pdf]", |
| 73 | 74 | "planopticon[cloud]", |
| 75 | + "planopticon[graph]", | |
| 74 | 76 | "planopticon[dev]", |
| 75 | 77 | ] |
| 76 | 78 | |
| 77 | 79 | [project.urls] |
| 78 | 80 | Homepage = "https://planopticon.dev" |
| 79 | 81 |
| --- pyproject.toml | |
| +++ pyproject.toml | |
| @@ -54,10 +54,11 @@ | |
| 54 | [project.optional-dependencies] |
| 55 | pdf = ["weasyprint>=60.0"] |
| 56 | gpu = ["torch>=2.0.0", "torchvision>=0.15.0"] |
| 57 | gdrive = ["google-auth>=2.0.0", "google-auth-oauthlib>=1.0.0", "google-api-python-client>=2.0.0"] |
| 58 | dropbox = ["dropbox>=12.0.0"] |
| 59 | cloud = [ |
| 60 | "planopticon[gdrive]", |
| 61 | "planopticon[dropbox]", |
| 62 | ] |
| 63 | dev = [ |
| @@ -69,10 +70,11 @@ | |
| 69 | "ruff>=0.1.0", |
| 70 | ] |
| 71 | all = [ |
| 72 | "planopticon[pdf]", |
| 73 | "planopticon[cloud]", |
| 74 | "planopticon[dev]", |
| 75 | ] |
| 76 | |
| 77 | [project.urls] |
| 78 | Homepage = "https://planopticon.dev" |
| 79 |
| --- pyproject.toml | |
| +++ pyproject.toml | |
| @@ -54,10 +54,11 @@ | |
| 54 | [project.optional-dependencies] |
| 55 | pdf = ["weasyprint>=60.0"] |
| 56 | gpu = ["torch>=2.0.0", "torchvision>=0.15.0"] |
| 57 | gdrive = ["google-auth>=2.0.0", "google-auth-oauthlib>=1.0.0", "google-api-python-client>=2.0.0"] |
| 58 | dropbox = ["dropbox>=12.0.0"] |
| 59 | graph = ["falkordblite>=0.4.0"] |
| 60 | cloud = [ |
| 61 | "planopticon[gdrive]", |
| 62 | "planopticon[dropbox]", |
| 63 | ] |
| 64 | dev = [ |
| @@ -69,10 +70,11 @@ | |
| 70 | "ruff>=0.1.0", |
| 71 | ] |
| 72 | all = [ |
| 73 | "planopticon[pdf]", |
| 74 | "planopticon[cloud]", |
| 75 | "planopticon[graph]", |
| 76 | "planopticon[dev]", |
| 77 | ] |
| 78 | |
| 79 | [project.urls] |
| 80 | Homepage = "https://planopticon.dev" |
| 81 |
+34
-60
| --- tests/test_batch.py | ||
| +++ tests/test_batch.py | ||
| @@ -17,54 +17,44 @@ | ||
| 17 | 17 | create_batch_output_dirs, |
| 18 | 18 | read_batch_manifest, |
| 19 | 19 | write_batch_manifest, |
| 20 | 20 | ) |
| 21 | 21 | |
| 22 | + | |
| 23 | +def _make_kg_with_entity(name, entity_type="concept", descriptions=None, occurrences=None): | |
| 24 | + """Helper to build a KnowledgeGraph with entities via the store API.""" | |
| 25 | + kg = KnowledgeGraph() | |
| 26 | + descs = list(descriptions) if descriptions else [] | |
| 27 | + kg._store.merge_entity(name, entity_type, descs) | |
| 28 | + for occ in occurrences or []: | |
| 29 | + kg._store.add_occurrence(name, occ.get("source", ""), occ.get("timestamp"), occ.get("text")) | |
| 30 | + return kg | |
| 31 | + | |
| 22 | 32 | |
| 23 | 33 | class TestKnowledgeGraphMerge: |
| 24 | 34 | def test_merge_new_nodes(self): |
| 25 | 35 | kg1 = KnowledgeGraph() |
| 26 | - kg1.nodes["Python"] = { | |
| 27 | - "id": "Python", | |
| 28 | - "name": "Python", | |
| 29 | - "type": "concept", | |
| 30 | - "descriptions": {"A programming language"}, | |
| 31 | - "occurrences": [{"source": "video1"}], | |
| 32 | - } | |
| 36 | + kg1._store.merge_entity("Python", "concept", ["A programming language"]) | |
| 37 | + kg1._store.add_occurrence("Python", "video1") | |
| 33 | 38 | |
| 34 | 39 | kg2 = KnowledgeGraph() |
| 35 | - kg2.nodes["Rust"] = { | |
| 36 | - "id": "Rust", | |
| 37 | - "name": "Rust", | |
| 38 | - "type": "concept", | |
| 39 | - "descriptions": {"A systems language"}, | |
| 40 | - "occurrences": [{"source": "video2"}], | |
| 41 | - } | |
| 40 | + kg2._store.merge_entity("Rust", "concept", ["A systems language"]) | |
| 41 | + kg2._store.add_occurrence("Rust", "video2") | |
| 42 | 42 | |
| 43 | 43 | kg1.merge(kg2) |
| 44 | 44 | assert "Python" in kg1.nodes |
| 45 | 45 | assert "Rust" in kg1.nodes |
| 46 | 46 | assert len(kg1.nodes) == 2 |
| 47 | 47 | |
| 48 | 48 | def test_merge_overlapping_nodes_case_insensitive(self): |
| 49 | 49 | kg1 = KnowledgeGraph() |
| 50 | - kg1.nodes["Python"] = { | |
| 51 | - "id": "Python", | |
| 52 | - "name": "Python", | |
| 53 | - "type": "concept", | |
| 54 | - "descriptions": {"Language A"}, | |
| 55 | - "occurrences": [{"source": "v1"}], | |
| 56 | - } | |
| 50 | + kg1._store.merge_entity("Python", "concept", ["Language A"]) | |
| 51 | + kg1._store.add_occurrence("Python", "v1") | |
| 57 | 52 | |
| 58 | 53 | kg2 = KnowledgeGraph() |
| 59 | - kg2.nodes["python"] = { | |
| 60 | - "id": "python", | |
| 61 | - "name": "python", | |
| 62 | - "type": "concept", | |
| 63 | - "descriptions": {"Language B"}, | |
| 64 | - "occurrences": [{"source": "v2"}], | |
| 65 | - } | |
| 54 | + kg2._store.merge_entity("python", "concept", ["Language B"]) | |
| 55 | + kg2._store.add_occurrence("python", "v2") | |
| 66 | 56 | |
| 67 | 57 | kg1.merge(kg2) |
| 68 | 58 | # Should merge into existing node, not create duplicate |
| 69 | 59 | assert len(kg1.nodes) == 1 |
| 70 | 60 | assert "Python" in kg1.nodes |
| @@ -71,43 +61,38 @@ | ||
| 71 | 61 | assert len(kg1.nodes["Python"]["occurrences"]) == 2 |
| 72 | 62 | assert "Language B" in kg1.nodes["Python"]["descriptions"] |
| 73 | 63 | |
| 74 | 64 | def test_merge_relationships(self): |
| 75 | 65 | kg1 = KnowledgeGraph() |
| 76 | - kg1.relationships = [{"source": "A", "target": "B", "type": "uses"}] | |
| 66 | + kg1._store.merge_entity("A", "concept", []) | |
| 67 | + kg1._store.merge_entity("B", "concept", []) | |
| 68 | + kg1._store.add_relationship("A", "B", "uses") | |
| 77 | 69 | |
| 78 | 70 | kg2 = KnowledgeGraph() |
| 79 | - kg2.relationships = [{"source": "C", "target": "D", "type": "calls"}] | |
| 71 | + kg2._store.merge_entity("C", "concept", []) | |
| 72 | + kg2._store.merge_entity("D", "concept", []) | |
| 73 | + kg2._store.add_relationship("C", "D", "calls") | |
| 80 | 74 | |
| 81 | 75 | kg1.merge(kg2) |
| 82 | 76 | assert len(kg1.relationships) == 2 |
| 83 | 77 | |
| 84 | 78 | def test_merge_empty_into_populated(self): |
| 85 | 79 | kg1 = KnowledgeGraph() |
| 86 | - kg1.nodes["X"] = { | |
| 87 | - "id": "X", | |
| 88 | - "name": "X", | |
| 89 | - "type": "concept", | |
| 90 | - "descriptions": set(), | |
| 91 | - "occurrences": [], | |
| 92 | - } | |
| 80 | + kg1._store.merge_entity("X", "concept", []) | |
| 81 | + | |
| 93 | 82 | kg2 = KnowledgeGraph() |
| 94 | 83 | kg1.merge(kg2) |
| 95 | 84 | assert len(kg1.nodes) == 1 |
| 96 | 85 | |
| 97 | 86 | |
| 98 | 87 | class TestKnowledgeGraphFromDict: |
| 99 | 88 | def test_round_trip(self): |
| 100 | 89 | kg = KnowledgeGraph() |
| 101 | - kg.nodes["Alice"] = { | |
| 102 | - "id": "Alice", | |
| 103 | - "name": "Alice", | |
| 104 | - "type": "person", | |
| 105 | - "descriptions": {"Team lead"}, | |
| 106 | - "occurrences": [{"source": "transcript"}], | |
| 107 | - } | |
| 108 | - kg.relationships = [{"source": "Alice", "target": "Bob", "type": "manages"}] | |
| 90 | + kg._store.merge_entity("Alice", "person", ["Team lead"]) | |
| 91 | + kg._store.add_occurrence("Alice", "transcript") | |
| 92 | + kg._store.merge_entity("Bob", "person", []) | |
| 93 | + kg._store.add_relationship("Alice", "Bob", "manages") | |
| 109 | 94 | |
| 110 | 95 | data = kg.to_dict() |
| 111 | 96 | restored = KnowledgeGraph.from_dict(data) |
| 112 | 97 | assert "Alice" in restored.nodes |
| 113 | 98 | assert restored.nodes["Alice"]["type"] == "person" |
| @@ -137,17 +122,12 @@ | ||
| 137 | 122 | |
| 138 | 123 | |
| 139 | 124 | class TestKnowledgeGraphSave: |
| 140 | 125 | def test_save_as_pydantic(self, tmp_path): |
| 141 | 126 | kg = KnowledgeGraph() |
| 142 | - kg.nodes["Test"] = { | |
| 143 | - "id": "Test", | |
| 144 | - "name": "Test", | |
| 145 | - "type": "concept", | |
| 146 | - "descriptions": {"A test entity"}, | |
| 147 | - "occurrences": [], | |
| 148 | - } | |
| 127 | + kg._store.merge_entity("Test", "concept", ["A test entity"]) | |
| 128 | + | |
| 149 | 129 | path = kg.save(tmp_path / "kg.json") |
| 150 | 130 | assert path.exists() |
| 151 | 131 | data = json.loads(path.read_text()) |
| 152 | 132 | assert "nodes" in data |
| 153 | 133 | assert data["nodes"][0]["name"] == "Test" |
| @@ -225,20 +205,14 @@ | ||
| 225 | 205 | def test_batch_summary_with_kg(self, tmp_path): |
| 226 | 206 | manifests = [ |
| 227 | 207 | VideoManifest(video=VideoMetadata(title="V1")), |
| 228 | 208 | ] |
| 229 | 209 | kg = KnowledgeGraph() |
| 230 | - kg.nodes["Test"] = { | |
| 231 | - "id": "Test", | |
| 232 | - "name": "Test", | |
| 233 | - "type": "concept", | |
| 234 | - "descriptions": set(), | |
| 235 | - "occurrences": [], | |
| 236 | - } | |
| 237 | - kg.relationships = [{"source": "Test", "target": "Test", "type": "self"}] | |
| 210 | + kg._store.merge_entity("Test", "concept", []) | |
| 211 | + kg._store.add_relationship("Test", "Test", "self") | |
| 238 | 212 | |
| 239 | 213 | gen = PlanGenerator() |
| 240 | 214 | summary = gen.generate_batch_summary( |
| 241 | 215 | manifests=manifests, kg=kg, output_path=tmp_path / "s.md" |
| 242 | 216 | ) |
| 243 | 217 | assert "Knowledge Graph" in summary |
| 244 | 218 | assert "mermaid" in summary |
| 245 | 219 | |
| 246 | 220 | ADDED tests/test_graph_store.py |
| --- tests/test_batch.py | |
| +++ tests/test_batch.py | |
| @@ -17,54 +17,44 @@ | |
| 17 | create_batch_output_dirs, |
| 18 | read_batch_manifest, |
| 19 | write_batch_manifest, |
| 20 | ) |
| 21 | |
| 22 | |
| 23 | class TestKnowledgeGraphMerge: |
| 24 | def test_merge_new_nodes(self): |
| 25 | kg1 = KnowledgeGraph() |
| 26 | kg1.nodes["Python"] = { |
| 27 | "id": "Python", |
| 28 | "name": "Python", |
| 29 | "type": "concept", |
| 30 | "descriptions": {"A programming language"}, |
| 31 | "occurrences": [{"source": "video1"}], |
| 32 | } |
| 33 | |
| 34 | kg2 = KnowledgeGraph() |
| 35 | kg2.nodes["Rust"] = { |
| 36 | "id": "Rust", |
| 37 | "name": "Rust", |
| 38 | "type": "concept", |
| 39 | "descriptions": {"A systems language"}, |
| 40 | "occurrences": [{"source": "video2"}], |
| 41 | } |
| 42 | |
| 43 | kg1.merge(kg2) |
| 44 | assert "Python" in kg1.nodes |
| 45 | assert "Rust" in kg1.nodes |
| 46 | assert len(kg1.nodes) == 2 |
| 47 | |
| 48 | def test_merge_overlapping_nodes_case_insensitive(self): |
| 49 | kg1 = KnowledgeGraph() |
| 50 | kg1.nodes["Python"] = { |
| 51 | "id": "Python", |
| 52 | "name": "Python", |
| 53 | "type": "concept", |
| 54 | "descriptions": {"Language A"}, |
| 55 | "occurrences": [{"source": "v1"}], |
| 56 | } |
| 57 | |
| 58 | kg2 = KnowledgeGraph() |
| 59 | kg2.nodes["python"] = { |
| 60 | "id": "python", |
| 61 | "name": "python", |
| 62 | "type": "concept", |
| 63 | "descriptions": {"Language B"}, |
| 64 | "occurrences": [{"source": "v2"}], |
| 65 | } |
| 66 | |
| 67 | kg1.merge(kg2) |
| 68 | # Should merge into existing node, not create duplicate |
| 69 | assert len(kg1.nodes) == 1 |
| 70 | assert "Python" in kg1.nodes |
| @@ -71,43 +61,38 @@ | |
| 71 | assert len(kg1.nodes["Python"]["occurrences"]) == 2 |
| 72 | assert "Language B" in kg1.nodes["Python"]["descriptions"] |
| 73 | |
| 74 | def test_merge_relationships(self): |
| 75 | kg1 = KnowledgeGraph() |
| 76 | kg1.relationships = [{"source": "A", "target": "B", "type": "uses"}] |
| 77 | |
| 78 | kg2 = KnowledgeGraph() |
| 79 | kg2.relationships = [{"source": "C", "target": "D", "type": "calls"}] |
| 80 | |
| 81 | kg1.merge(kg2) |
| 82 | assert len(kg1.relationships) == 2 |
| 83 | |
| 84 | def test_merge_empty_into_populated(self): |
| 85 | kg1 = KnowledgeGraph() |
| 86 | kg1.nodes["X"] = { |
| 87 | "id": "X", |
| 88 | "name": "X", |
| 89 | "type": "concept", |
| 90 | "descriptions": set(), |
| 91 | "occurrences": [], |
| 92 | } |
| 93 | kg2 = KnowledgeGraph() |
| 94 | kg1.merge(kg2) |
| 95 | assert len(kg1.nodes) == 1 |
| 96 | |
| 97 | |
| 98 | class TestKnowledgeGraphFromDict: |
| 99 | def test_round_trip(self): |
| 100 | kg = KnowledgeGraph() |
| 101 | kg.nodes["Alice"] = { |
| 102 | "id": "Alice", |
| 103 | "name": "Alice", |
| 104 | "type": "person", |
| 105 | "descriptions": {"Team lead"}, |
| 106 | "occurrences": [{"source": "transcript"}], |
| 107 | } |
| 108 | kg.relationships = [{"source": "Alice", "target": "Bob", "type": "manages"}] |
| 109 | |
| 110 | data = kg.to_dict() |
| 111 | restored = KnowledgeGraph.from_dict(data) |
| 112 | assert "Alice" in restored.nodes |
| 113 | assert restored.nodes["Alice"]["type"] == "person" |
| @@ -137,17 +122,12 @@ | |
| 137 | |
| 138 | |
| 139 | class TestKnowledgeGraphSave: |
| 140 | def test_save_as_pydantic(self, tmp_path): |
| 141 | kg = KnowledgeGraph() |
| 142 | kg.nodes["Test"] = { |
| 143 | "id": "Test", |
| 144 | "name": "Test", |
| 145 | "type": "concept", |
| 146 | "descriptions": {"A test entity"}, |
| 147 | "occurrences": [], |
| 148 | } |
| 149 | path = kg.save(tmp_path / "kg.json") |
| 150 | assert path.exists() |
| 151 | data = json.loads(path.read_text()) |
| 152 | assert "nodes" in data |
| 153 | assert data["nodes"][0]["name"] == "Test" |
| @@ -225,20 +205,14 @@ | |
| 225 | def test_batch_summary_with_kg(self, tmp_path): |
| 226 | manifests = [ |
| 227 | VideoManifest(video=VideoMetadata(title="V1")), |
| 228 | ] |
| 229 | kg = KnowledgeGraph() |
| 230 | kg.nodes["Test"] = { |
| 231 | "id": "Test", |
| 232 | "name": "Test", |
| 233 | "type": "concept", |
| 234 | "descriptions": set(), |
| 235 | "occurrences": [], |
| 236 | } |
| 237 | kg.relationships = [{"source": "Test", "target": "Test", "type": "self"}] |
| 238 | |
| 239 | gen = PlanGenerator() |
| 240 | summary = gen.generate_batch_summary( |
| 241 | manifests=manifests, kg=kg, output_path=tmp_path / "s.md" |
| 242 | ) |
| 243 | assert "Knowledge Graph" in summary |
| 244 | assert "mermaid" in summary |
| 245 | |
| 246 | DDED tests/test_graph_store.py |
| --- tests/test_batch.py | |
| +++ tests/test_batch.py | |
| @@ -17,54 +17,44 @@ | |
| 17 | create_batch_output_dirs, |
| 18 | read_batch_manifest, |
| 19 | write_batch_manifest, |
| 20 | ) |
| 21 | |
| 22 | |
| 23 | def _make_kg_with_entity(name, entity_type="concept", descriptions=None, occurrences=None): |
| 24 | """Helper to build a KnowledgeGraph with entities via the store API.""" |
| 25 | kg = KnowledgeGraph() |
| 26 | descs = list(descriptions) if descriptions else [] |
| 27 | kg._store.merge_entity(name, entity_type, descs) |
| 28 | for occ in occurrences or []: |
| 29 | kg._store.add_occurrence(name, occ.get("source", ""), occ.get("timestamp"), occ.get("text")) |
| 30 | return kg |
| 31 | |
| 32 | |
| 33 | class TestKnowledgeGraphMerge: |
| 34 | def test_merge_new_nodes(self): |
| 35 | kg1 = KnowledgeGraph() |
| 36 | kg1._store.merge_entity("Python", "concept", ["A programming language"]) |
| 37 | kg1._store.add_occurrence("Python", "video1") |
| 38 | |
| 39 | kg2 = KnowledgeGraph() |
| 40 | kg2._store.merge_entity("Rust", "concept", ["A systems language"]) |
| 41 | kg2._store.add_occurrence("Rust", "video2") |
| 42 | |
| 43 | kg1.merge(kg2) |
| 44 | assert "Python" in kg1.nodes |
| 45 | assert "Rust" in kg1.nodes |
| 46 | assert len(kg1.nodes) == 2 |
| 47 | |
| 48 | def test_merge_overlapping_nodes_case_insensitive(self): |
| 49 | kg1 = KnowledgeGraph() |
| 50 | kg1._store.merge_entity("Python", "concept", ["Language A"]) |
| 51 | kg1._store.add_occurrence("Python", "v1") |
| 52 | |
| 53 | kg2 = KnowledgeGraph() |
| 54 | kg2._store.merge_entity("python", "concept", ["Language B"]) |
| 55 | kg2._store.add_occurrence("python", "v2") |
| 56 | |
| 57 | kg1.merge(kg2) |
| 58 | # Should merge into existing node, not create duplicate |
| 59 | assert len(kg1.nodes) == 1 |
| 60 | assert "Python" in kg1.nodes |
| @@ -71,43 +61,38 @@ | |
| 61 | assert len(kg1.nodes["Python"]["occurrences"]) == 2 |
| 62 | assert "Language B" in kg1.nodes["Python"]["descriptions"] |
| 63 | |
| 64 | def test_merge_relationships(self): |
| 65 | kg1 = KnowledgeGraph() |
| 66 | kg1._store.merge_entity("A", "concept", []) |
| 67 | kg1._store.merge_entity("B", "concept", []) |
| 68 | kg1._store.add_relationship("A", "B", "uses") |
| 69 | |
| 70 | kg2 = KnowledgeGraph() |
| 71 | kg2._store.merge_entity("C", "concept", []) |
| 72 | kg2._store.merge_entity("D", "concept", []) |
| 73 | kg2._store.add_relationship("C", "D", "calls") |
| 74 | |
| 75 | kg1.merge(kg2) |
| 76 | assert len(kg1.relationships) == 2 |
| 77 | |
| 78 | def test_merge_empty_into_populated(self): |
| 79 | kg1 = KnowledgeGraph() |
| 80 | kg1._store.merge_entity("X", "concept", []) |
| 81 | |
| 82 | kg2 = KnowledgeGraph() |
| 83 | kg1.merge(kg2) |
| 84 | assert len(kg1.nodes) == 1 |
| 85 | |
| 86 | |
| 87 | class TestKnowledgeGraphFromDict: |
| 88 | def test_round_trip(self): |
| 89 | kg = KnowledgeGraph() |
| 90 | kg._store.merge_entity("Alice", "person", ["Team lead"]) |
| 91 | kg._store.add_occurrence("Alice", "transcript") |
| 92 | kg._store.merge_entity("Bob", "person", []) |
| 93 | kg._store.add_relationship("Alice", "Bob", "manages") |
| 94 | |
| 95 | data = kg.to_dict() |
| 96 | restored = KnowledgeGraph.from_dict(data) |
| 97 | assert "Alice" in restored.nodes |
| 98 | assert restored.nodes["Alice"]["type"] == "person" |
| @@ -137,17 +122,12 @@ | |
| 122 | |
| 123 | |
| 124 | class TestKnowledgeGraphSave: |
| 125 | def test_save_as_pydantic(self, tmp_path): |
| 126 | kg = KnowledgeGraph() |
| 127 | kg._store.merge_entity("Test", "concept", ["A test entity"]) |
| 128 | |
| 129 | path = kg.save(tmp_path / "kg.json") |
| 130 | assert path.exists() |
| 131 | data = json.loads(path.read_text()) |
| 132 | assert "nodes" in data |
| 133 | assert data["nodes"][0]["name"] == "Test" |
| @@ -225,20 +205,14 @@ | |
| 205 | def test_batch_summary_with_kg(self, tmp_path): |
| 206 | manifests = [ |
| 207 | VideoManifest(video=VideoMetadata(title="V1")), |
| 208 | ] |
| 209 | kg = KnowledgeGraph() |
| 210 | kg._store.merge_entity("Test", "concept", []) |
| 211 | kg._store.add_relationship("Test", "Test", "self") |
| 212 | |
| 213 | gen = PlanGenerator() |
| 214 | summary = gen.generate_batch_summary( |
| 215 | manifests=manifests, kg=kg, output_path=tmp_path / "s.md" |
| 216 | ) |
| 217 | assert "Knowledge Graph" in summary |
| 218 | assert "mermaid" in summary |
| 219 | |
| 220 | DDED tests/test_graph_store.py |
+209
| --- a/tests/test_graph_store.py | ||
| +++ b/tests/test_graph_store.py | ||
| @@ -0,0 +1,209 @@ | ||
| 1 | +"""Tests for gimport pytest graph storage backends.""" | |
| 2 | + | |
| 3 | +from video_processor.integrators.graph_snMemoryStore, SQLiteStore, create_store | |
| 4 | + | |
| 5 | + | |
| 6 | +class TestInMemoryStore: | |
| 7 | + def test_merge_entity_creates_new(self): | |
| 8 | + store = InMemoryStore() | |
| 9 | + store.merge_entity("Python", "technology", ["A programming language"]) | |
| 10 | + assert store.get_entity_count() == 1 | |
| 11 | + entity = store.get_entity("python") | |
| 12 | + assert entity is not None | |
| 13 | + assert entity["name"] == "Python" | |
| 14 | + assert entity["type"] == "technology" | |
| 15 | + assert "A programming language" in entity["descriptions"] | |
| 16 | + | |
| 17 | + def test_merge_entity_case_insensitive_dedup(self): | |
| 18 | + store = InMemoryStore() | |
| 19 | + store.merge_entity("Python", "technology", ["Language"]) | |
| 20 | + store.merge_entity("python", "technology", ["Snake-based"]) | |
| 21 | + store.merge_entity("PYTHON", "technology", ["Popular"]) | |
| 22 | + assert store.get_entity_count() == 1 | |
| 23 | + entity = store.get_entity("Python") | |
| 24 | + assert entity is not None | |
| 25 | + assert "Language" in entity["descriptions"] | |
| 26 | + assert "Snake-based" in entity["descriptions"] | |
| 27 | + assert "Popular" in entity["descriptions"] | |
| 28 | + | |
| 29 | + def test_add_occurrence(self): | |
| 30 | + store = InMemoryStore() | |
| 31 | + store.merge_entity("Alice", "person", ["Engineer"]) | |
| 32 | + store.add_occurrence("Alice", "transcript_0", timestamp=10.5, text="Alice said...") | |
| 33 | + entity = store.get_entity("alice") | |
| 34 | + assert len(entity["occurrences"]) == 1 | |
| 35 | + assert entity["occurrences"][0]["source"] == "transcript_0" | |
| 36 | + assert entity["occurrences"][0]["timestamp"] == 10.5 | |
| 37 | + | |
| 38 | + def test_add_occurrence_nonexistent_entity(self): | |
| 39 | + store = InMemoryStore() | |
| 40 | + store.add_occurrence("Ghost", "transcript_0") | |
| 41 | + # Should not crash, just no-op | |
| 42 | + assert store.get_entity_count() == 0 | |
| 43 | + | |
| 44 | + def test_add_relationship(self): | |
| 45 | + store = InMemoryStore() | |
| 46 | + store.merge_entity("Alice", "person", []) | |
| 47 | + store.merge_entity("Bob", "person", []) | |
| 48 | + store.add_relationship("Alice", "Bob", "knows", content_source="t0", timestamp=5.0) | |
| 49 | + assert store.get_relationship_count() == 1 | |
| 50 | + rels = store.get_all_relationships() | |
| 51 | + assert rels[0]["source"] == "Alice" | |
| 52 | + assert rels[0]["target"] == "Bob" | |
| 53 | + assert rels[0]["type"] == "knows" | |
| 54 | + | |
| 55 | + def test_has_entity(self): | |
| 56 | + store = InMemoryStore() | |
| 57 | + assert not store.has_entity("Python") | |
| 58 | + store.merge_entity("Python", "technology", []) | |
| 59 | + assert store.has_entity("Python") | |
| 60 | + assert store.has_entity("python") | |
| 61 | + assert store.has_entity("PYTHON") | |
| 62 | + | |
| 63 | + def test_get_entity_not_found(self): | |
| 64 | + store = InMemoryStore() | |
| 65 | + assert store.get_entity("nonexistent") is None | |
| 66 | + | |
| 67 | + def test_get_all_entities(self): | |
| 68 | + store = InMemoryStore() | |
| 69 | + store.merge_entity("Alice", "person", ["Engineer"]) | |
| 70 | + store.merge_entity("Bob", "person", ["Manager"]) | |
| 71 | + entities = store.get_all_entities() | |
| 72 | + assert len(entities) == 2 | |
| 73 | + names = {e["name"] for e in entities} | |
| 74 | + assert names == {"Alice", "Bob"} | |
| 75 | + | |
| 76 | + def test_to_dict_format(self): | |
| 77 | + store = InMemoryStore() | |
| 78 | + store.merge_entity("Python", "technology", ["A language"]) | |
| 79 | + store.merge_entity("Django", "technology", ["A framework"]) | |
| 80 | + store.add_relationship("Django", "Python", "uses") | |
| 81 | + store.add_occurrence("Python", "transcript_0", timestamp=1.0, text="mentioned Python") | |
| 82 | + | |
| 83 | + data = store.to_dict() | |
| 84 | + assert "nodes" in data | |
| 85 | + assert "relationships" in data | |
| 86 | + assert len(data["nodes"]) == 2 | |
| 87 | + assert len(data["relationships"]) == 1 | |
| 88 | + | |
| 89 | + # Descriptions should be lists (not sets) | |
| 90 | + for node in data["nodes"]: | |
| 91 | + assert isinstance(node["descriptions"], list) | |
| 92 | + assert "id" in node | |
| 93 | + assert "name" in node | |
| 94 | + assert "type" in node | |
| 95 | + | |
| 96 | + def test_to_dict_roundtrip(self): | |
| 97 | + """Verify to_dict produces data that can reload into a new store.""" | |
| 98 | + store = InMemoryStore() | |
| 99 | + store.merge_entity("Alice", "person", ["Engineer"]) | |
| 100 | + store.merge_entity("Bob", "person", ["Manager"]) | |
| 101 | + store.add_relationship("Alice", "Bob", "reports_to") | |
| 102 | + store.add_occurrence("Alice", "src", timestamp=1.0, text="hello") | |
| 103 | + | |
| 104 | + data = store.to_dict() | |
| 105 | + | |
| 106 | + # Reload into a new store | |
| 107 | + store2 = InMemoryStore() | |
| 108 | + for node in data["nodes"]: | |
| 109 | + store2.merge_entity( | |
| 110 | + node["name"], node["type"], node["descriptions"], node.get("source") | |
| 111 | + ) | |
| 112 | + for occ in node.get("occurrences", []): | |
| 113 | + store2.add_occurrence( | |
| 114 | + node["name"], occ["source"], occ.get("timestamp"), occ.get("text") | |
| 115 | + ) | |
| 116 | + for rel in data["relationships"]: | |
| 117 | + store2.add_relationship( | |
| 118 | + rel["source"], | |
| 119 | + rel["target"], | |
| 120 | + rel["type"], | |
| 121 | + rel.get("content_source"), | |
| 122 | + rel.get("timestamp"), | |
| 123 | + ) | |
| 124 | + | |
| 125 | + assert store2.get_entity_count() == 2 | |
| 126 | + assert store2.get_relationship_count() == 1 | |
| 127 | + | |
| 128 | + def test_empty_store(self): | |
| 129 | + store = InMemoryStore() | |
| 130 | + assert store.get_entity_count() == 0 | |
| 131 | + assert store.get_relationship_count() == 0 | |
| 132 | + assert store.get_all_entities() == [] | |
| 133 | + assert store.get_all_relationships() == [] | |
| 134 | + data = store.to_dict() | |
| 135 | + assert data == {"nodes": [], "relationships": []} | |
| 136 | + | |
| 137 | + | |
| 138 | +class TestCreateStore: | |
| 139 | + def test_returns_in_memory_without_path(self): | |
| 140 | + store = create_store() | |
| 141 | + assert isinstance(store, InMemoryStore) | |
| 142 | + | |
| 143 | + def test_returns_in_memory_with_none_path(self): | |
| 144 | + store = create_store(db_path=None) | |
| 145 | + assert isinstance(store, Infallback_to_in_memor(self, tmp_path): | |
| 146 | + store = SQLiteStore(tmp_path / "test.db") | |
| 147 | + store.merge_entity("Python", "technology", []) | |
| 148 | + assert store.set_entity_properties("Python", {"version": "3.12", "stable": True}) | |
| 149 | + assert not store.set_entity_properties("Ghost", {"key": "val"}) | |
| 150 | + store.close() | |
| 151 | + | |
| 152 | + def test_has_relationship(self, tmp_path): | |
| 153 | + store = SQLiteStore(tmp_path / "test.db") | |
| 154 | + store.merge_entity("Alice", "person", []) | |
| 155 | + store.mfalkordbty("Bob", "person", []) | |
| 156 | + store.add_relationship("Alice", "Bob", "knows") | |
| 157 | + assert store.has_relationship("Alice", "Bob") | |
| 158 | + assert store.has_relationship("alice", "bob") | |
| 159 | + assert store.has_relationship("Alice", "Bob", "knows") | |
| 160 | + assert not store.has_relationship("Alice", "Bob", "hates") | |
| 161 | + assert not store.has_relationship("Bob", "Alice") | |
| 162 | + store.close() | |
| 163 | +"""When falkordblite is not installed, should fall back gracefullyet"] == "Bob" | |
| 164 | + tmp_path): | |
| 165 | + store# Will be FalkorDBStore if installed, InMemoryStore if not | |
| 166 | + # Either way, it should work | |
| 167 | + store.merge_entity("Test", "concept", ["test entity"]) | |
| 168 | + assert sto | |
| 169 | + | |
| 170 | +# Conditional FalkorDB tests | |
| 171 | +_falkordb_available = False | |
| 172 | +try: | |
| 173 | + import redislite # noqa: F401 | |
| 174 | + | |
| 175 | + _falkordb_available = True | |
| 176 | +except ImportError: | |
| 177 | + pass | |
| 178 | + | |
| 179 | + | |
| 180 | +@pytest.mark.skipif(not _falkordb_available, reason="falkordblite not installed") | |
| 181 | +class TestFalkorDB.close() | |
| 182 | + | |
| 183 | + | |
| 184 | +class TestSQLiteStore: | |
| 185 | + def test_create_and_query_entitraph storage backends."""FalkorDBStore | |
| 186 | + | |
| 187 | + store = FalkorDBore = create_store(db_path=No store.merge_entity("Python", "technology", ["A language"]) | |
| 188 | + assert store.get_entity_count() == 1 | |
| 189 | + entity = store.get_entity("python") | |
| 190 | + assert entity is not None | |
| 191 | + assert entity["name"] == "Python" | |
| 192 | + store.close() | |
| 193 | + | |
| 194 | + def test_case_insensitive_mergraph storage backends."""FalkorDBStore | |
| 195 | + | |
| 196 | + store = FalkorDBore = create_store(db_path=No store.merge_entityLanguage"]) | |
| 197 | + store.merge_entity("python", "technology", ["Snake-based"]) | |
| 198 | + assert store.get_) | |
| 199 | + assert store.get_entity_count() == 1 | |
| 200 | + entity = store.get_entit"L assert "A programming language"lationship_countriptions"] | |
| 201 | + aInMemoryStore) | |
| 202 | + | |
| 203 | + def test store raph storage backends."""FalkorDBStore | |
| 204 | + | |
| 205 | + store = FalkorDBore = create_store(db_path=None) | |
| 206 | + assert isinstance(store, InMemoryS]) | |
| 207 | + store.merge_entity(" store.get_all_relationships() == [] | |
| 208 | + data = store.to_dict() | |
| 209 | + |
| --- a/tests/test_graph_store.py | |
| +++ b/tests/test_graph_store.py | |
| @@ -0,0 +1,209 @@ | |
| --- a/tests/test_graph_store.py | |
| +++ b/tests/test_graph_store.py | |
| @@ -0,0 +1,209 @@ | |
| 1 | """Tests for gimport pytest graph storage backends.""" |
| 2 | |
| 3 | from video_processor.integrators.graph_snMemoryStore, SQLiteStore, create_store |
| 4 | |
| 5 | |
| 6 | class TestInMemoryStore: |
| 7 | def test_merge_entity_creates_new(self): |
| 8 | store = InMemoryStore() |
| 9 | store.merge_entity("Python", "technology", ["A programming language"]) |
| 10 | assert store.get_entity_count() == 1 |
| 11 | entity = store.get_entity("python") |
| 12 | assert entity is not None |
| 13 | assert entity["name"] == "Python" |
| 14 | assert entity["type"] == "technology" |
| 15 | assert "A programming language" in entity["descriptions"] |
| 16 | |
| 17 | def test_merge_entity_case_insensitive_dedup(self): |
| 18 | store = InMemoryStore() |
| 19 | store.merge_entity("Python", "technology", ["Language"]) |
| 20 | store.merge_entity("python", "technology", ["Snake-based"]) |
| 21 | store.merge_entity("PYTHON", "technology", ["Popular"]) |
| 22 | assert store.get_entity_count() == 1 |
| 23 | entity = store.get_entity("Python") |
| 24 | assert entity is not None |
| 25 | assert "Language" in entity["descriptions"] |
| 26 | assert "Snake-based" in entity["descriptions"] |
| 27 | assert "Popular" in entity["descriptions"] |
| 28 | |
| 29 | def test_add_occurrence(self): |
| 30 | store = InMemoryStore() |
| 31 | store.merge_entity("Alice", "person", ["Engineer"]) |
| 32 | store.add_occurrence("Alice", "transcript_0", timestamp=10.5, text="Alice said...") |
| 33 | entity = store.get_entity("alice") |
| 34 | assert len(entity["occurrences"]) == 1 |
| 35 | assert entity["occurrences"][0]["source"] == "transcript_0" |
| 36 | assert entity["occurrences"][0]["timestamp"] == 10.5 |
| 37 | |
| 38 | def test_add_occurrence_nonexistent_entity(self): |
| 39 | store = InMemoryStore() |
| 40 | store.add_occurrence("Ghost", "transcript_0") |
| 41 | # Should not crash, just no-op |
| 42 | assert store.get_entity_count() == 0 |
| 43 | |
| 44 | def test_add_relationship(self): |
| 45 | store = InMemoryStore() |
| 46 | store.merge_entity("Alice", "person", []) |
| 47 | store.merge_entity("Bob", "person", []) |
| 48 | store.add_relationship("Alice", "Bob", "knows", content_source="t0", timestamp=5.0) |
| 49 | assert store.get_relationship_count() == 1 |
| 50 | rels = store.get_all_relationships() |
| 51 | assert rels[0]["source"] == "Alice" |
| 52 | assert rels[0]["target"] == "Bob" |
| 53 | assert rels[0]["type"] == "knows" |
| 54 | |
| 55 | def test_has_entity(self): |
| 56 | store = InMemoryStore() |
| 57 | assert not store.has_entity("Python") |
| 58 | store.merge_entity("Python", "technology", []) |
| 59 | assert store.has_entity("Python") |
| 60 | assert store.has_entity("python") |
| 61 | assert store.has_entity("PYTHON") |
| 62 | |
| 63 | def test_get_entity_not_found(self): |
| 64 | store = InMemoryStore() |
| 65 | assert store.get_entity("nonexistent") is None |
| 66 | |
| 67 | def test_get_all_entities(self): |
| 68 | store = InMemoryStore() |
| 69 | store.merge_entity("Alice", "person", ["Engineer"]) |
| 70 | store.merge_entity("Bob", "person", ["Manager"]) |
| 71 | entities = store.get_all_entities() |
| 72 | assert len(entities) == 2 |
| 73 | names = {e["name"] for e in entities} |
| 74 | assert names == {"Alice", "Bob"} |
| 75 | |
| 76 | def test_to_dict_format(self): |
| 77 | store = InMemoryStore() |
| 78 | store.merge_entity("Python", "technology", ["A language"]) |
| 79 | store.merge_entity("Django", "technology", ["A framework"]) |
| 80 | store.add_relationship("Django", "Python", "uses") |
| 81 | store.add_occurrence("Python", "transcript_0", timestamp=1.0, text="mentioned Python") |
| 82 | |
| 83 | data = store.to_dict() |
| 84 | assert "nodes" in data |
| 85 | assert "relationships" in data |
| 86 | assert len(data["nodes"]) == 2 |
| 87 | assert len(data["relationships"]) == 1 |
| 88 | |
| 89 | # Descriptions should be lists (not sets) |
| 90 | for node in data["nodes"]: |
| 91 | assert isinstance(node["descriptions"], list) |
| 92 | assert "id" in node |
| 93 | assert "name" in node |
| 94 | assert "type" in node |
| 95 | |
| 96 | def test_to_dict_roundtrip(self): |
| 97 | """Verify to_dict produces data that can reload into a new store.""" |
| 98 | store = InMemoryStore() |
| 99 | store.merge_entity("Alice", "person", ["Engineer"]) |
| 100 | store.merge_entity("Bob", "person", ["Manager"]) |
| 101 | store.add_relationship("Alice", "Bob", "reports_to") |
| 102 | store.add_occurrence("Alice", "src", timestamp=1.0, text="hello") |
| 103 | |
| 104 | data = store.to_dict() |
| 105 | |
| 106 | # Reload into a new store |
| 107 | store2 = InMemoryStore() |
| 108 | for node in data["nodes"]: |
| 109 | store2.merge_entity( |
| 110 | node["name"], node["type"], node["descriptions"], node.get("source") |
| 111 | ) |
| 112 | for occ in node.get("occurrences", []): |
| 113 | store2.add_occurrence( |
| 114 | node["name"], occ["source"], occ.get("timestamp"), occ.get("text") |
| 115 | ) |
| 116 | for rel in data["relationships"]: |
| 117 | store2.add_relationship( |
| 118 | rel["source"], |
| 119 | rel["target"], |
| 120 | rel["type"], |
| 121 | rel.get("content_source"), |
| 122 | rel.get("timestamp"), |
| 123 | ) |
| 124 | |
| 125 | assert store2.get_entity_count() == 2 |
| 126 | assert store2.get_relationship_count() == 1 |
| 127 | |
| 128 | def test_empty_store(self): |
| 129 | store = InMemoryStore() |
| 130 | assert store.get_entity_count() == 0 |
| 131 | assert store.get_relationship_count() == 0 |
| 132 | assert store.get_all_entities() == [] |
| 133 | assert store.get_all_relationships() == [] |
| 134 | data = store.to_dict() |
| 135 | assert data == {"nodes": [], "relationships": []} |
| 136 | |
| 137 | |
| 138 | class TestCreateStore: |
| 139 | def test_returns_in_memory_without_path(self): |
| 140 | store = create_store() |
| 141 | assert isinstance(store, InMemoryStore) |
| 142 | |
| 143 | def test_returns_in_memory_with_none_path(self): |
| 144 | store = create_store(db_path=None) |
| 145 | assert isinstance(store, Infallback_to_in_memor(self, tmp_path): |
| 146 | store = SQLiteStore(tmp_path / "test.db") |
| 147 | store.merge_entity("Python", "technology", []) |
| 148 | assert store.set_entity_properties("Python", {"version": "3.12", "stable": True}) |
| 149 | assert not store.set_entity_properties("Ghost", {"key": "val"}) |
| 150 | store.close() |
| 151 | |
| 152 | def test_has_relationship(self, tmp_path): |
| 153 | store = SQLiteStore(tmp_path / "test.db") |
| 154 | store.merge_entity("Alice", "person", []) |
| 155 | store.mfalkordbty("Bob", "person", []) |
| 156 | store.add_relationship("Alice", "Bob", "knows") |
| 157 | assert store.has_relationship("Alice", "Bob") |
| 158 | assert store.has_relationship("alice", "bob") |
| 159 | assert store.has_relationship("Alice", "Bob", "knows") |
| 160 | assert not store.has_relationship("Alice", "Bob", "hates") |
| 161 | assert not store.has_relationship("Bob", "Alice") |
| 162 | store.close() |
| 163 | """When falkordblite is not installed, should fall back gracefullyet"] == "Bob" |
| 164 | tmp_path): |
| 165 | store# Will be FalkorDBStore if installed, InMemoryStore if not |
| 166 | # Either way, it should work |
| 167 | store.merge_entity("Test", "concept", ["test entity"]) |
| 168 | assert sto |
| 169 | |
| 170 | # Conditional FalkorDB tests |
| 171 | _falkordb_available = False |
| 172 | try: |
| 173 | import redislite # noqa: F401 |
| 174 | |
| 175 | _falkordb_available = True |
| 176 | except ImportError: |
| 177 | pass |
| 178 | |
| 179 | |
| 180 | @pytest.mark.skipif(not _falkordb_available, reason="falkordblite not installed") |
| 181 | class TestFalkorDB.close() |
| 182 | |
| 183 | |
| 184 | class TestSQLiteStore: |
| 185 | def test_create_and_query_entitraph storage backends."""FalkorDBStore |
| 186 | |
| 187 | store = FalkorDBore = create_store(db_path=No store.merge_entity("Python", "technology", ["A language"]) |
| 188 | assert store.get_entity_count() == 1 |
| 189 | entity = store.get_entity("python") |
| 190 | assert entity is not None |
| 191 | assert entity["name"] == "Python" |
| 192 | store.close() |
| 193 | |
| 194 | def test_case_insensitive_mergraph storage backends."""FalkorDBStore |
| 195 | |
| 196 | store = FalkorDBore = create_store(db_path=No store.merge_entityLanguage"]) |
| 197 | store.merge_entity("python", "technology", ["Snake-based"]) |
| 198 | assert store.get_) |
| 199 | assert store.get_entity_count() == 1 |
| 200 | entity = store.get_entit"L assert "A programming language"lationship_countriptions"] |
| 201 | aInMemoryStore) |
| 202 | |
| 203 | def test store raph storage backends."""FalkorDBStore |
| 204 | |
| 205 | store = FalkorDBore = create_store(db_path=None) |
| 206 | assert isinstance(store, InMemoryS]) |
| 207 | store.merge_entity(" store.get_all_relationships() == [] |
| 208 | data = store.to_dict() |
| 209 |
| --- video_processor/agent/orchestrator.py | ||
| +++ video_processor/agent/orchestrator.py | ||
| @@ -191,11 +191,12 @@ | ||
| 191 | 191 | |
| 192 | 192 | elif step_name == "build_knowledge_graph": |
| 193 | 193 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 194 | 194 | |
| 195 | 195 | transcript = self._results.get("transcribe", {}) |
| 196 | - kg = KnowledgeGraph(provider_manager=self.pm) | |
| 196 | + kg_db_path = dirs["results"] / "knowledge_graph.db" | |
| 197 | + kg = KnowledgeGraph(provider_manager=self.pm, db_path=kg_db_path) | |
| 197 | 198 | kg.process_transcript(transcript) |
| 198 | 199 | |
| 199 | 200 | diagram_result = self._results.get("detect_diagrams", {}) |
| 200 | 201 | diagrams = diagram_result.get("diagrams", []) |
| 201 | 202 | if diagrams: |
| 202 | 203 |
| --- video_processor/agent/orchestrator.py | |
| +++ video_processor/agent/orchestrator.py | |
| @@ -191,11 +191,12 @@ | |
| 191 | |
| 192 | elif step_name == "build_knowledge_graph": |
| 193 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 194 | |
| 195 | transcript = self._results.get("transcribe", {}) |
| 196 | kg = KnowledgeGraph(provider_manager=self.pm) |
| 197 | kg.process_transcript(transcript) |
| 198 | |
| 199 | diagram_result = self._results.get("detect_diagrams", {}) |
| 200 | diagrams = diagram_result.get("diagrams", []) |
| 201 | if diagrams: |
| 202 |
| --- video_processor/agent/orchestrator.py | |
| +++ video_processor/agent/orchestrator.py | |
| @@ -191,11 +191,12 @@ | |
| 191 | |
| 192 | elif step_name == "build_knowledge_graph": |
| 193 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 194 | |
| 195 | transcript = self._results.get("transcribe", {}) |
| 196 | kg_db_path = dirs["results"] / "knowledge_graph.db" |
| 197 | kg = KnowledgeGraph(provider_manager=self.pm, db_path=kg_db_path) |
| 198 | kg.process_transcript(transcript) |
| 199 | |
| 200 | diagram_result = self._results.get("detect_diagrams", {}) |
| 201 | diagrams = diagram_result.get("diagrams", []) |
| 202 | if diagrams: |
| 203 |
+3
-1
| --- video_processor/cli/commands.py | ||
| +++ video_processor/cli/commands.py | ||
| @@ -254,11 +254,12 @@ | ||
| 254 | 254 | logging.info(f"Found {len(videos)} videos to process") |
| 255 | 255 | |
| 256 | 256 | dirs = create_batch_output_dirs(output, title) |
| 257 | 257 | manifests = [] |
| 258 | 258 | entries = [] |
| 259 | - merged_kg = KnowledgeGraph() | |
| 259 | + merged_kg_db = Path(output) / "knowledge_graph.db" | |
| 260 | + merged_kg = KnowledgeGraph(db_path=merged_kg_db) | |
| 260 | 261 | |
| 261 | 262 | for idx, video_path in enumerate(tqdm(videos, desc="Batch processing", unit="video")): |
| 262 | 263 | video_name = video_path.stem |
| 263 | 264 | video_output = dirs["videos"] / video_name |
| 264 | 265 | logging.info(f"Processing video {idx + 1}/{len(videos)}: {video_path.name}") |
| @@ -325,10 +326,11 @@ | ||
| 325 | 326 | total_action_items=sum(e.action_items_count for e in entries), |
| 326 | 327 | total_key_points=sum(e.key_points_count for e in entries), |
| 327 | 328 | videos=entries, |
| 328 | 329 | batch_summary_md="batch_summary.md", |
| 329 | 330 | merged_knowledge_graph_json="knowledge_graph.json", |
| 331 | + merged_knowledge_graph_db="knowledge_graph.db", | |
| 330 | 332 | ) |
| 331 | 333 | write_batch_manifest(batch_manifest, output) |
| 332 | 334 | click.echo(pm.usage.format_summary()) |
| 333 | 335 | click.echo( |
| 334 | 336 | f"\n Batch complete: {batch_manifest.completed_videos}" |
| 335 | 337 | |
| 336 | 338 | ADDED video_processor/integrators/graph_store.py |
| --- video_processor/cli/commands.py | |
| +++ video_processor/cli/commands.py | |
| @@ -254,11 +254,12 @@ | |
| 254 | logging.info(f"Found {len(videos)} videos to process") |
| 255 | |
| 256 | dirs = create_batch_output_dirs(output, title) |
| 257 | manifests = [] |
| 258 | entries = [] |
| 259 | merged_kg = KnowledgeGraph() |
| 260 | |
| 261 | for idx, video_path in enumerate(tqdm(videos, desc="Batch processing", unit="video")): |
| 262 | video_name = video_path.stem |
| 263 | video_output = dirs["videos"] / video_name |
| 264 | logging.info(f"Processing video {idx + 1}/{len(videos)}: {video_path.name}") |
| @@ -325,10 +326,11 @@ | |
| 325 | total_action_items=sum(e.action_items_count for e in entries), |
| 326 | total_key_points=sum(e.key_points_count for e in entries), |
| 327 | videos=entries, |
| 328 | batch_summary_md="batch_summary.md", |
| 329 | merged_knowledge_graph_json="knowledge_graph.json", |
| 330 | ) |
| 331 | write_batch_manifest(batch_manifest, output) |
| 332 | click.echo(pm.usage.format_summary()) |
| 333 | click.echo( |
| 334 | f"\n Batch complete: {batch_manifest.completed_videos}" |
| 335 | |
| 336 | DDED video_processor/integrators/graph_store.py |
| --- video_processor/cli/commands.py | |
| +++ video_processor/cli/commands.py | |
| @@ -254,11 +254,12 @@ | |
| 254 | logging.info(f"Found {len(videos)} videos to process") |
| 255 | |
| 256 | dirs = create_batch_output_dirs(output, title) |
| 257 | manifests = [] |
| 258 | entries = [] |
| 259 | merged_kg_db = Path(output) / "knowledge_graph.db" |
| 260 | merged_kg = KnowledgeGraph(db_path=merged_kg_db) |
| 261 | |
| 262 | for idx, video_path in enumerate(tqdm(videos, desc="Batch processing", unit="video")): |
| 263 | video_name = video_path.stem |
| 264 | video_output = dirs["videos"] / video_name |
| 265 | logging.info(f"Processing video {idx + 1}/{len(videos)}: {video_path.name}") |
| @@ -325,10 +326,11 @@ | |
| 326 | total_action_items=sum(e.action_items_count for e in entries), |
| 327 | total_key_points=sum(e.key_points_count for e in entries), |
| 328 | videos=entries, |
| 329 | batch_summary_md="batch_summary.md", |
| 330 | merged_knowledge_graph_json="knowledge_graph.json", |
| 331 | merged_knowledge_graph_db="knowledge_graph.db", |
| 332 | ) |
| 333 | write_batch_manifest(batch_manifest, output) |
| 334 | click.echo(pm.usage.format_summary()) |
| 335 | click.echo( |
| 336 | f"\n Batch complete: {batch_manifest.completed_videos}" |
| 337 | |
| 338 | DDED video_processor/integrators/graph_store.py |
| --- a/video_processor/integrators/graph_store.py | ||
| +++ b/video_processor/integrators/graph_store.py | ||
| @@ -0,0 +1,355 @@ | ||
| 1 | +"""Graph storage backends for PlanOpticon knowledge graphs.""" | |
| 2 | + | |
| 3 | +import loggingport logging | |
| 4 | +import sqlite3 | |
| 5 | +from abc import ABC, abstractmethod | |
| 6 | +from pathlib import Path | |
| 7 | +from typing import Any, Dict, List, Optional, Union | |
| 8 | + | |
| 9 | +logger = logging.getLogger(__name__) | |
| 10 | + | |
| 11 | + | |
| 12 | +class GraphStore(ABC): | |
| 13 | + """Abstract interface for knowledge graph storage backends.""" | |
| 14 | + | |
| 15 | + @abstractmethod | |
| 16 | + def merge_entity( | |
| 17 | + self, | |
| 18 | + name: str, | |
| 19 | + entity_type: str, | |
| 20 | + descriptions: List[str], | |
| 21 | + source: Optional[str] = None, | |
| 22 | + ) -> None: | |
| 23 | + """Upsert an entity by case-insensitive name.""" | |
| 24 | + ... | |
| 25 | + | |
| 26 | + @abstractmethod | |
| 27 | + def add_occurrence( | |
| 28 | + self, | |
| 29 | + entity_name: str, | |
| 30 | + source: str, | |
| 31 | + timestamp: Optional[float] = None, | |
| 32 | + text: Optional[str] = None, | |
| 33 | + ) -> None: | |
| 34 | + """Add an occurrence record to an existing entity.""" | |
| 35 | + ... | |
| 36 | + | |
| 37 | + @abstractmethod | |
| 38 | + def add_relationship( | |
| 39 | + self, | |
| 40 | + source: str, | |
| 41 | + target: str, | |
| 42 | + rel_type: str, | |
| 43 | + content_source: Optional[str] = None, | |
| 44 | + timestamp: Optional[float] = None, | |
| 45 | + ) -> None: | |
| 46 | + """Add a relationship between two entities (both must already exist).""" | |
| 47 | + ... | |
| 48 | + | |
| 49 | + @abstractmethod | |
| 50 | + def get_entity: | |
| 51 | + """Add a relationship with a custom edge label (e.g. DEPENDS_OCypher for FalkorDBtionship which always uses RELATED_TO, this creates edges | |
| 52 | + with the specified label for richer graph semantics. | |
| 53 | + """ | |
| 54 | + ... | |
| 55 | + | |
| 56 | + @abstractmethod | |
| 57 | + def set_entity_properties( | |
| 58 | + self, | |
| 59 | + name: str, | |
| 60 | + properties: Dict[str, Any], | |
| 61 | + ) -> bool: | |
| 62 | + """Set arbitrary key/value properties on an existing entity. | |
| 63 | + | |
| 64 | + Returns True if the entity was found and updated, False otherwise. | |
| 65 | + """ | |
| 66 | + ... | |
| 67 | + | |
| 68 | + @abstractmethod | |
| 69 | + def has_relationship( | |
| 70 | + self, | |
| 71 | + source: str, | |
| 72 | + str, | |
| 73 | + properties: Dict[str, Any], | |
| 74 | + ) -> bool: | |
| 75 | + """Set arbitrary key/value properties on an existing entity. | |
| 76 | + | |
| 77 | + Returns True if the entity was found and updated, False otherwise. | |
| 78 | + """ | |
| 79 | + ... | |
| 80 | + | |
| 81 | + @abstractmethod | |
| 82 | + def has_relationship( | |
| 83 | + self, | |
| 84 | + source: str, | |
| 85 | + target: str, | |
| 86 | + edge_label: Optional[str] = None, | |
| 87 | + ) -> bool: | |
| 88 | + """Check if a relationship exists between two entities. | |
| 89 | + | |
| 90 | + If edge_label is None, checks for any relationship type. | |
| 91 | + """ | |
| 92 | + ... | |
| 93 | + | |
| 94 | + def register_source(self, source: Dict[str, Any]) -> None: | |
| 95 | + """Register a contenturn {"nodes": nodes, "relationships": self.get_all_relationships()} | |
| 96 | + | |
| 97 | + | |
| 98 | +class InMemoryStore(GraphStore): | |
| 99 | + """In-memory graph store using Python dicts. Default fallback.""" | |
| 100 | + | |
| 101 | + def __init__(self) -> None: | |
| 102 | + self._nodes: Dict[str, Dict[str, Any]] = {} # keyed by name.lower() | |
| 103 | + self._relationships: List[Dict[str, Any]] = [] | |
| 104 | + | |
| 105 | + def merge_entity( | |
| 106 | + self, | |
| 107 | + name: str, | |
| 108 | + entity_type: str, | |
| 109 | + descriptions: List[str], | |
| 110 | + source: Optional[str] = None, | |
| 111 | + ) -> None: | |
| 112 | + key = name.lower() | |
| 113 | + if key in self._nodes: | |
| 114 | + if descriptions: | |
| 115 | + self._nodes[key]["descriptions"].upraph storage bae_entity( | |
| 116 | + selRequires falkordblite package entities = self.get relationship_i# Patch redis 7.x compat: UnixDomainSocketConnection missing 'port'"" | |
| 117 | + | |
| 118 | + dimport redis.connection | |
| 119 | + | |
| 120 | + if not hasattr(redis.connection.UnixDomainSocdated, F redis.connection.UnixDomainfrom redislite import FalkorDB | |
| 121 | + None, | |
| 122 | + **kwargs, | |
| 123 | +db = FalkorDBity or relationship with locatgraph = self._dbself._ensure_indexesf.get_all_entities() | |
| 124 | + nodes = [] | |
| 125 | + for e in entities: | |
| 126 | + descs = e.get("descriptions", []) | |
| 127 | + if isinstance(descs, set): | |
| 128 | + descs = list(descs) | |
| 129 | + nodes.append( | |
| 130 | + { | |
| 131 | + "id": e.get("id", e["name"]), | |
| 132 | + "name": e["name"], | |
| 133 | + "type": e.get("type", "conc]:urce: Opt tryal, Union | |
| 134 | + | |
| 135 | +logger thod | |
| 136 | + def get_eip( | |
| 137 | + self, | |
| 138 | + source: str, | |
| 139 | + target: str, | |
| 140 | + rel_type: str, | |
| 141 | + content_source: Optional[str] = None, | |
| 142 | + timestamp: Optional[float] = None, | |
| 143 | + ) -> None: | |
| 144 | + """Add a relationship between two entities (both must already exist).""" | |
| 145 | + ... | |
| 146 | + | |
| 147 | + @abstractmethod | |
| 148 | + def get_entity(self, name: str) -> Optional[Dict[str, Any]]: | |
| 149 | + """Get an entity by case-insensitive name, or None.""" | |
| 150 | + ... | |
| 151 | + | |
| 152 | + @abstractmethod | |
| 153 | + def get_all_entities(self) -> List[Dict[str, Any]]: | |
| 154 | + """Return all entities as dicts.""" | |
| 155 | + ... | |
| 156 | + | |
| 157 | + @abstractmethod | |
| 158 | + def get_all_relationships(self) -> List[Dict[str, Any]]: | |
| 159 | + """Return all relationships as dicts.""" | |
| 160 | + ... | |
| 161 | + | |
| 162 | + @abstractmethod | |
| 163 | + def get_entity_count(self) -> int: ... | |
| 164 | + | |
| 165 | + @abstractmethod | |
| 166 | + def get_relationship_count(self) -> int: ... | |
| 167 | + | |
| 168 | + @abstractmethod | |
| 169 | + def has_entity(self, name: str) -> bool: | |
| 170 | + """Check if an entity exists (case-insensitive).""" | |
| 171 | + ... | |
| 172 | + | |
| 173 | + @abstractmethod | |
| 174 | + def add_typed_relationship( | |
| 175 | + self, | |
| 176 | + ch alfalkordb RELATED_TO, this creates edges | |
| 177 | + with the specified label for richer graph semantics. | |
| 178 | + """ | |
| 179 | + ... | |
| 180 | + | |
| 181 | + @abstractmethod | |
| 182 | + def set_entity_properties( | |
| 183 | + self, | |
| 184 | + name: str, | |
| 185 | + properties: Dict[str, Any], | |
| 186 | + ) -> bool: | |
| 187 | + """Set arbitrary key/value properties on an existing entity. | |
| 188 | + | |
| 189 | + Returns True if the entity was found and updated, False otherwise. | |
| 190 | + """ | |
| 191 | + ... | |
| 192 | + | |
| 193 | + @abstractmethod | |
| 194 | + def has_relationship( | |
| 195 | + self, | |
| 196 | + source: str, | |
| 197 | + target: str, | |
| 198 | + edge_label: Optional[str] = None, | |
| 199 | + ) -> bool: | |
| 200 | + """Check if a relationship exists between two entities. | |
| 201 | + | |
| 202 | + If edge_label is None, checks for any relationship type. | |
| 203 | + """ | |
| 204 | + ... | |
| 205 | + | |
| 206 | + def register_source(self, source: Dict[str, Any]) -> None: | |
| 207 | + """Register a content source. Default no-op for backends that don't support it.""" | |
| 208 | + pass | |
| 209 | + | |
| 210 | + def get_sources(self) -> List[Dict[str, Any]]: | |
| 211 | + """Return all registered sources.""" | |
| 212 | + return [] | |
| 213 | + | |
| 214 | + def get_source(self, source_id: str) -> Optional[Dict[str, Any]]: | |
| 215 | + """Get a source by ID.""" | |
| 216 | + return None | |
| 217 | + | |
| 218 | + def add_source_location( | |
| 219 | + self, | |
| 220 | + source_id: str, | |
| 221 | + entity_name_lower: Optional[str] = None, | |
| 222 | + relationship_id: Optional[int] = None, | |
| 223 | + **kwargs, | |
| 224 | + ) -> None: | |
| 225 | + """Link a source to an entity or relationship with location details.""" | |
| 226 | + pass | |
| 227 | + | |
| 228 | + def get_entity_provenance(self, name: str) -> List[Dict[str, Any]]: | |
| 229 | + """Get all source locations for an entity.""" | |
| 230 | + return [] | |
| 231 | + | |
| 232 | + def raw_query(self, query_string: str) -> Any: | |
| 233 | + """Execute a raw query against the backend (e.g. SQL for SQLite). | |
| 234 | + | |
| 235 | + Not supported by all backends — raises NotImplementedError by default. | |
| 236 | + """ | |
| 237 | + raise NotImplementedError(f"{type(self).__name__} does not support raw queries") | |
| 238 | + | |
| 239 | + def to_dict(self) -> Dict[str, Any]: | |
| 240 | + """Export to JSON-compatible dict matching knowledge_graph.json format.""" | |
| 241 | + entities = self.get_all_entities() | |
| 242 | + nodes = [] | |
| 243 | + for e in entities: | |
| 244 | + descs = e.get("descriptions", []) | |
| 245 | + if isinstance(descs, set): | |
| 246 | + descs = list(descs) | |
| 247 | + nodes.append( | |
| 248 | + { | |
| 249 | + "id": e.get("id", e["name"]), | |
| 250 | + "name": e["name"], | |
| 251 | + "type": e.get("type", "concept"), | |
| 252 | + "descriptions": descs, | |
| 253 | + "occurrences": e.get("occurrences", []), | |
| 254 | + } | |
| 255 | + ) | |
| 256 | + result = {"nodes": nodes, "relationships": self.get_all_relationships()} | |
| 257 | + sources = self.get_sources() | |
| 258 | + if sources: | |
| 259 | + result["sources"] = sources | |
| 260 | + return result | |
| 261 | + | |
| 262 | + | |
| 263 | +class InMemoryStore(GraphStore): | |
| 264 | + """In-memory graph store using Python dicts. Default fallback.""" | |
| 265 | + | |
| 266 | + def __init__(self) -> None: | |
| 267 | + self._nodes: Dict[str, Dict[str, Any]] = {} # keyed by name.lower() | |
| 268 | + self._relationships: List[Dict[str, Any]] = [] | |
| 269 | + self._sources: Dict[str, Dict[str, Any]] = {} # keyed by source_id | |
| 270 | + self._source_locations: List[Dict[str, Any]] = [] | |
| 271 | + | |
| 272 | + def merge_entity( | |
| 273 | + self, | |
| 274 | + name: s backends for PlanOpticon knowledge graphs.""" | |
| 275 | + | |
| 276 | +import json | |
| 277 | +import logging | |
| 278 | +import sqlite3 | |
| 279 | +from abc import ABC, abstractmethod | |
| 280 | +from pathlib import Path | |
| 281 | +from typing import Any, Dict, List, Optional, Union | |
| 282 | + | |
| 283 | +logger = logging.getLogger(__name__) | |
| 284 | + | |
| 285 | + | |
| 286 | +class GraphStore(ABC): | |
| 287 | + """Abstract interface for knowledge graph storage backends.""" | |
| 288 | + | |
| 289 | + @abstractmethod | |
| 290 | + def merge_entity( | |
| 291 | + self, | |
| 292 | + name: str, | |
| 293 | + entity_type: str, | |
| 294 | + descriptions: List[str], | |
| 295 | + source: Optional[str] = None, | |
| 296 | + ) -> None: | |
| 297 | + """Upsert an entity by case-insensitive name.""" | |
| 298 | + ... | |
| 299 | + | |
| 300 | + @abstracfor query in [ | |
| 301 | + ABC, ab"CREATE INDEX FOR (e:Entit ABC, ab"CREATE INDEX FOR (e ABC, ab"CREATE INDEX FOR (e:Entity) ON ( self._graph.query(query) | |
| 302 | + | |
| 303 | + ) -> except Exception:pass # index already exists= list(descs) | |
| 304 | + nodes.append( | |
| 305 | + { | |
| 306 | + "id": e.get("id", e["name"]), | |
| 307 | + "name": e["name"], | |
| 308 | + name_lower = name.lower() | |
| 309 | + descs # Check if entity exists by defaulesult = self._graph.query( | |
| 310 | + " "MATCH (e:Entity {name_lower: $name_lower}) RETURN e.descriptions", | |
| 311 | + ck.""" | |
| 312 | + | |
| 313 | + params={"name_lowet(descs) | |
| 314 | +if result.result_set: | |
| 315 | + # Entity exists — merge d:RELATED_TOsting_descs = result.result_set[ional, Union | |
| 316 | + | |
| 317 | +logger = l"""Graph storage backends for PlanOce"H@3ql,4:"})"I@3IW,8:params={H@3ql,Y@19g,S: "name_lower": name_lowerI@3IW,e@1A9,P: "descs": descriptionsI@3IW,b@1Mt,3:}, | |
| 318 | +B@tl,1: 3c@2V0,J:self._graph.query( | |
| 319 | +9@2he,o: "MATCH (e:Entity {name_lower: $name_lower}) " | |
| 320 | + 8@r8,1H: "CREATE (o:Occurrence {source: $source, timestamp: $timestamp, text: $text}) " | |
| 321 | +C@3Gd,Y:"CREATE (e)-[:OCCURRED_IN]->(o)", | |
| 322 | +A@2TB,A: params={H@3ql,O:"name_lower": name_lowerJ@3IW,Z@1My,Y@1PM,L: "text": text, | |
| 323 | + 8@3B~,4: }, | |
| 324 | +3f@1IU,16:graph.ABC): | |
| 325 | + """Abstract interface for knowledge graph storage backends.""" | |
| 326 | + | |
| 327 | + @abstractmethod | |
| 328 | + def merge_entity( | |
| 329 | + self, | |
| 330 | + name: str, | |
| 331 | + entity_type: str, | |
| 332 | + descriptions: List[str], | |
| 333 | + source: Optional[str] = None, | |
| 334 | + ) -> None: | |
| 335 | + """Upsert an entity by case-insensitive name.""" | |
| 336 | + ... | |
| 337 | + | |
| 338 | + @abstractmethod | |
| 339 | + def add_occurrence( | |
| 340 | + self, | |
| 341 | + entity_name: str, | |
| 342 | + source: str, | |
| 343 | + timestamp: Optional[float] = None, | |
| 344 | + text: Optional[str] = None, | |
| 345 | + ) -> None: | |
| 346 | + """Add an occurrence record to an existing entity.""" | |
| 347 | + ... | |
| 348 | + | |
| 349 | + @abstractmethod | |
| 350 | + def add_relationship( | |
| 351 | + self, | |
| 352 | + source: str, | |
| 353 | + target: str, | |
| 354 | + rel_type: str, | |
| 355 | + content |
| --- a/video_processor/integrators/graph_store.py | |
| +++ b/video_processor/integrators/graph_store.py | |
| @@ -0,0 +1,355 @@ | |
| --- a/video_processor/integrators/graph_store.py | |
| +++ b/video_processor/integrators/graph_store.py | |
| @@ -0,0 +1,355 @@ | |
| 1 | """Graph storage backends for PlanOpticon knowledge graphs.""" |
| 2 | |
| 3 | import loggingport logging |
| 4 | import sqlite3 |
| 5 | from abc import ABC, abstractmethod |
| 6 | from pathlib import Path |
| 7 | from typing import Any, Dict, List, Optional, Union |
| 8 | |
| 9 | logger = logging.getLogger(__name__) |
| 10 | |
| 11 | |
| 12 | class GraphStore(ABC): |
| 13 | """Abstract interface for knowledge graph storage backends.""" |
| 14 | |
| 15 | @abstractmethod |
| 16 | def merge_entity( |
| 17 | self, |
| 18 | name: str, |
| 19 | entity_type: str, |
| 20 | descriptions: List[str], |
| 21 | source: Optional[str] = None, |
| 22 | ) -> None: |
| 23 | """Upsert an entity by case-insensitive name.""" |
| 24 | ... |
| 25 | |
| 26 | @abstractmethod |
| 27 | def add_occurrence( |
| 28 | self, |
| 29 | entity_name: str, |
| 30 | source: str, |
| 31 | timestamp: Optional[float] = None, |
| 32 | text: Optional[str] = None, |
| 33 | ) -> None: |
| 34 | """Add an occurrence record to an existing entity.""" |
| 35 | ... |
| 36 | |
| 37 | @abstractmethod |
| 38 | def add_relationship( |
| 39 | self, |
| 40 | source: str, |
| 41 | target: str, |
| 42 | rel_type: str, |
| 43 | content_source: Optional[str] = None, |
| 44 | timestamp: Optional[float] = None, |
| 45 | ) -> None: |
| 46 | """Add a relationship between two entities (both must already exist).""" |
| 47 | ... |
| 48 | |
| 49 | @abstractmethod |
| 50 | def get_entity: |
| 51 | """Add a relationship with a custom edge label (e.g. DEPENDS_OCypher for FalkorDBtionship which always uses RELATED_TO, this creates edges |
| 52 | with the specified label for richer graph semantics. |
| 53 | """ |
| 54 | ... |
| 55 | |
| 56 | @abstractmethod |
| 57 | def set_entity_properties( |
| 58 | self, |
| 59 | name: str, |
| 60 | properties: Dict[str, Any], |
| 61 | ) -> bool: |
| 62 | """Set arbitrary key/value properties on an existing entity. |
| 63 | |
| 64 | Returns True if the entity was found and updated, False otherwise. |
| 65 | """ |
| 66 | ... |
| 67 | |
| 68 | @abstractmethod |
| 69 | def has_relationship( |
| 70 | self, |
| 71 | source: str, |
| 72 | str, |
| 73 | properties: Dict[str, Any], |
| 74 | ) -> bool: |
| 75 | """Set arbitrary key/value properties on an existing entity. |
| 76 | |
| 77 | Returns True if the entity was found and updated, False otherwise. |
| 78 | """ |
| 79 | ... |
| 80 | |
| 81 | @abstractmethod |
| 82 | def has_relationship( |
| 83 | self, |
| 84 | source: str, |
| 85 | target: str, |
| 86 | edge_label: Optional[str] = None, |
| 87 | ) -> bool: |
| 88 | """Check if a relationship exists between two entities. |
| 89 | |
| 90 | If edge_label is None, checks for any relationship type. |
| 91 | """ |
| 92 | ... |
| 93 | |
| 94 | def register_source(self, source: Dict[str, Any]) -> None: |
| 95 | """Register a contenturn {"nodes": nodes, "relationships": self.get_all_relationships()} |
| 96 | |
| 97 | |
| 98 | class InMemoryStore(GraphStore): |
| 99 | """In-memory graph store using Python dicts. Default fallback.""" |
| 100 | |
| 101 | def __init__(self) -> None: |
| 102 | self._nodes: Dict[str, Dict[str, Any]] = {} # keyed by name.lower() |
| 103 | self._relationships: List[Dict[str, Any]] = [] |
| 104 | |
| 105 | def merge_entity( |
| 106 | self, |
| 107 | name: str, |
| 108 | entity_type: str, |
| 109 | descriptions: List[str], |
| 110 | source: Optional[str] = None, |
| 111 | ) -> None: |
| 112 | key = name.lower() |
| 113 | if key in self._nodes: |
| 114 | if descriptions: |
| 115 | self._nodes[key]["descriptions"].upraph storage bae_entity( |
| 116 | selRequires falkordblite package entities = self.get relationship_i# Patch redis 7.x compat: UnixDomainSocketConnection missing 'port'"" |
| 117 | |
| 118 | dimport redis.connection |
| 119 | |
| 120 | if not hasattr(redis.connection.UnixDomainSocdated, F redis.connection.UnixDomainfrom redislite import FalkorDB |
| 121 | None, |
| 122 | **kwargs, |
| 123 | db = FalkorDBity or relationship with locatgraph = self._dbself._ensure_indexesf.get_all_entities() |
| 124 | nodes = [] |
| 125 | for e in entities: |
| 126 | descs = e.get("descriptions", []) |
| 127 | if isinstance(descs, set): |
| 128 | descs = list(descs) |
| 129 | nodes.append( |
| 130 | { |
| 131 | "id": e.get("id", e["name"]), |
| 132 | "name": e["name"], |
| 133 | "type": e.get("type", "conc]:urce: Opt tryal, Union |
| 134 | |
| 135 | logger thod |
| 136 | def get_eip( |
| 137 | self, |
| 138 | source: str, |
| 139 | target: str, |
| 140 | rel_type: str, |
| 141 | content_source: Optional[str] = None, |
| 142 | timestamp: Optional[float] = None, |
| 143 | ) -> None: |
| 144 | """Add a relationship between two entities (both must already exist).""" |
| 145 | ... |
| 146 | |
| 147 | @abstractmethod |
| 148 | def get_entity(self, name: str) -> Optional[Dict[str, Any]]: |
| 149 | """Get an entity by case-insensitive name, or None.""" |
| 150 | ... |
| 151 | |
| 152 | @abstractmethod |
| 153 | def get_all_entities(self) -> List[Dict[str, Any]]: |
| 154 | """Return all entities as dicts.""" |
| 155 | ... |
| 156 | |
| 157 | @abstractmethod |
| 158 | def get_all_relationships(self) -> List[Dict[str, Any]]: |
| 159 | """Return all relationships as dicts.""" |
| 160 | ... |
| 161 | |
| 162 | @abstractmethod |
| 163 | def get_entity_count(self) -> int: ... |
| 164 | |
| 165 | @abstractmethod |
| 166 | def get_relationship_count(self) -> int: ... |
| 167 | |
| 168 | @abstractmethod |
| 169 | def has_entity(self, name: str) -> bool: |
| 170 | """Check if an entity exists (case-insensitive).""" |
| 171 | ... |
| 172 | |
| 173 | @abstractmethod |
| 174 | def add_typed_relationship( |
| 175 | self, |
| 176 | ch alfalkordb RELATED_TO, this creates edges |
| 177 | with the specified label for richer graph semantics. |
| 178 | """ |
| 179 | ... |
| 180 | |
| 181 | @abstractmethod |
| 182 | def set_entity_properties( |
| 183 | self, |
| 184 | name: str, |
| 185 | properties: Dict[str, Any], |
| 186 | ) -> bool: |
| 187 | """Set arbitrary key/value properties on an existing entity. |
| 188 | |
| 189 | Returns True if the entity was found and updated, False otherwise. |
| 190 | """ |
| 191 | ... |
| 192 | |
| 193 | @abstractmethod |
| 194 | def has_relationship( |
| 195 | self, |
| 196 | source: str, |
| 197 | target: str, |
| 198 | edge_label: Optional[str] = None, |
| 199 | ) -> bool: |
| 200 | """Check if a relationship exists between two entities. |
| 201 | |
| 202 | If edge_label is None, checks for any relationship type. |
| 203 | """ |
| 204 | ... |
| 205 | |
| 206 | def register_source(self, source: Dict[str, Any]) -> None: |
| 207 | """Register a content source. Default no-op for backends that don't support it.""" |
| 208 | pass |
| 209 | |
| 210 | def get_sources(self) -> List[Dict[str, Any]]: |
| 211 | """Return all registered sources.""" |
| 212 | return [] |
| 213 | |
| 214 | def get_source(self, source_id: str) -> Optional[Dict[str, Any]]: |
| 215 | """Get a source by ID.""" |
| 216 | return None |
| 217 | |
| 218 | def add_source_location( |
| 219 | self, |
| 220 | source_id: str, |
| 221 | entity_name_lower: Optional[str] = None, |
| 222 | relationship_id: Optional[int] = None, |
| 223 | **kwargs, |
| 224 | ) -> None: |
| 225 | """Link a source to an entity or relationship with location details.""" |
| 226 | pass |
| 227 | |
| 228 | def get_entity_provenance(self, name: str) -> List[Dict[str, Any]]: |
| 229 | """Get all source locations for an entity.""" |
| 230 | return [] |
| 231 | |
| 232 | def raw_query(self, query_string: str) -> Any: |
| 233 | """Execute a raw query against the backend (e.g. SQL for SQLite). |
| 234 | |
| 235 | Not supported by all backends — raises NotImplementedError by default. |
| 236 | """ |
| 237 | raise NotImplementedError(f"{type(self).__name__} does not support raw queries") |
| 238 | |
| 239 | def to_dict(self) -> Dict[str, Any]: |
| 240 | """Export to JSON-compatible dict matching knowledge_graph.json format.""" |
| 241 | entities = self.get_all_entities() |
| 242 | nodes = [] |
| 243 | for e in entities: |
| 244 | descs = e.get("descriptions", []) |
| 245 | if isinstance(descs, set): |
| 246 | descs = list(descs) |
| 247 | nodes.append( |
| 248 | { |
| 249 | "id": e.get("id", e["name"]), |
| 250 | "name": e["name"], |
| 251 | "type": e.get("type", "concept"), |
| 252 | "descriptions": descs, |
| 253 | "occurrences": e.get("occurrences", []), |
| 254 | } |
| 255 | ) |
| 256 | result = {"nodes": nodes, "relationships": self.get_all_relationships()} |
| 257 | sources = self.get_sources() |
| 258 | if sources: |
| 259 | result["sources"] = sources |
| 260 | return result |
| 261 | |
| 262 | |
| 263 | class InMemoryStore(GraphStore): |
| 264 | """In-memory graph store using Python dicts. Default fallback.""" |
| 265 | |
| 266 | def __init__(self) -> None: |
| 267 | self._nodes: Dict[str, Dict[str, Any]] = {} # keyed by name.lower() |
| 268 | self._relationships: List[Dict[str, Any]] = [] |
| 269 | self._sources: Dict[str, Dict[str, Any]] = {} # keyed by source_id |
| 270 | self._source_locations: List[Dict[str, Any]] = [] |
| 271 | |
| 272 | def merge_entity( |
| 273 | self, |
| 274 | name: str,
| 275 | |
| 276 | import json |
| 277 | import logging |
| 278 | import sqlite3 |
| 279 | from abc import ABC, abstractmethod |
| 280 | from pathlib import Path |
| 281 | from typing import Any, Dict, List, Optional, Union |
| 282 | |
| 283 | logger = logging.getLogger(__name__) |
| 284 | |
| 285 | |
| 286 | class GraphStore(ABC): |
| 287 | """Abstract interface for knowledge graph storage backends.""" |
| 288 | |
| 289 | @abstractmethod |
| 290 | def merge_entity( |
| 291 | self, |
| 292 | name: str, |
| 293 | entity_type: str, |
| 294 | descriptions: List[str], |
| 295 | source: Optional[str] = None, |
| 296 | ) -> None: |
| 297 | """Upsert an entity by case-insensitive name.""" |
| 298 | ... |
| 299 | |
| 300 | for query in ["CREATE INDEX FOR (e:Entity) ON (e.name_lower)"]:
| 301 | try:
| 302 | self._graph.query(query)
| 303 | except Exception: pass  # index already exists
| 304 | nodes.append( |
| 305 | { |
| 306 | "id": e.get("id", e["name"]), |
| 307 | "name": e["name"], |
| 308 | name_lower = name.lower() |
| 309 | # Check if entity exists
| 310 | result = self._graph.query(
| 311 | "MATCH (e:Entity {name_lower: $name_lower}) RETURN e.descriptions",
| 312 | params={"name_lower": name_lower},
| 313 | )
| 314 | if result.result_set:
| 315 | # Entity exists — merge descriptions
| 316 | |
| 317 | params={"name_lower": name_lower, "descs": descriptions},
| 318 | )
| 319 | self._graph.query(
| 320 | "MATCH (e:Entity {name_lower: $name_lower}) "
| 321 | "CREATE (o:Occurrence {source: $source, timestamp: $timestamp, text: $text}) "
| 322 | "CREATE (e)-[:OCCURRED_IN]->(o)",
| 323 | params={"name_lower": name_lower, "source": source, "timestamp": timestamp, "text": text})
| 324 | class GraphStore(ABC):
| 325 | """Abstract interface for knowledge graph storage backends.""" |
| 326 | |
| 327 | @abstractmethod |
| 328 | def merge_entity( |
| 329 | self, |
| 330 | name: str, |
| 331 | entity_type: str, |
| 332 | descriptions: List[str], |
| 333 | source: Optional[str] = None, |
| 334 | ) -> None: |
| 335 | """Upsert an entity by case-insensitive name.""" |
| 336 | ... |
| 337 | |
| 338 | @abstractmethod |
| 339 | def add_occurrence( |
| 340 | self, |
| 341 | entity_name: str, |
| 342 | source: str, |
| 343 | timestamp: Optional[float] = None, |
| 344 | text: Optional[str] = None, |
| 345 | ) -> None: |
| 346 | """Add an occurrence record to an existing entity.""" |
| 347 | ... |
| 348 | |
| 349 | @abstractmethod |
| 350 | def add_relationship( |
| 351 | self, |
| 352 | source: str, |
| 353 | target: str, |
| 354 | rel_type: str, |
| 355 | content_source: Optional[str] = None,
+109
-121
| --- video_processor/integrators/knowledge_graph.py | ||
| +++ video_processor/integrators/knowledge_graph.py | ||
| @@ -4,10 +4,11 @@ | ||
| 4 | 4 | from pathlib import Path |
| 5 | 5 | from typing import Dict, List, Optional, Union |
| 6 | 6 | |
| 7 | 7 | from tqdm import tqdm |
| 8 | 8 | |
| 9 | +from video_processor.integrators.graph_store import GraphStore, create_store | |
| 9 | 10 | from video_processor.models import Entity, KnowledgeGraphData, Relationship |
| 10 | 11 | from video_processor.providers.manager import ProviderManager |
| 11 | 12 | from video_processor.utils.json_parsing import parse_json_from_response |
| 12 | 13 | |
| 13 | 14 | logger = logging.getLogger(__name__) |
| @@ -17,14 +18,36 @@ | ||
| 17 | 18 | """Integrates extracted content into a structured knowledge graph.""" |
| 18 | 19 | |
| 19 | 20 | def __init__( |
| 20 | 21 | self, |
| 21 | 22 | provider_manager: Optional[ProviderManager] = None, |
| 23 | + db_path: Optional[Path] = None, | |
| 24 | + store: Optional[GraphStore] = None, | |
| 22 | 25 | ): |
| 23 | 26 | self.pm = provider_manager |
| 24 | - self.nodes: Dict[str, dict] = {} | |
| 25 | - self.relationships: List[dict] = [] | |
| 27 | + self._store = store or create_store(db_path) | |
| 28 | + | |
| 29 | + @property | |
| 30 | + def nodes(self) -> Dict[str, dict]: | |
| 31 | + """Backward-compatible read access to nodes as a dict keyed by entity name.""" | |
| 32 | + result = {} | |
| 33 | + for entity in self._store.get_all_entities(): | |
| 34 | + name = entity["name"] | |
| 35 | + descs = entity.get("descriptions", []) | |
| 36 | + result[name] = { | |
| 37 | + "id": entity.get("id", name), | |
| 38 | + "name": name, | |
| 39 | + "type": entity.get("type", "concept"), | |
| 40 | + "descriptions": set(descs) if isinstance(descs, list) else descs, | |
| 41 | + "occurrences": entity.get("occurrences", []), | |
| 42 | + } | |
| 43 | + return result | |
| 44 | + | |
| 45 | + @property | |
| 46 | + def relationships(self) -> List[dict]: | |
| 47 | + """Backward-compatible read access to relationships.""" | |
| 48 | + return self._store.get_all_relationships() | |
| 26 | 49 | |
| 27 | 50 | def _chat(self, prompt: str, temperature: float = 0.3) -> str: |
| 28 | 51 | """Send a chat message through ProviderManager (or return empty if none).""" |
| 29 | 52 | if not self.pm: |
| 30 | 53 | return "" |
| @@ -92,47 +115,24 @@ | ||
| 92 | 115 | |
| 93 | 116 | def add_content(self, text: str, source: str, timestamp: Optional[float] = None) -> None: |
| 94 | 117 | """Add content to knowledge graph by extracting entities and relationships.""" |
| 95 | 118 | entities, relationships = self.extract_entities_and_relationships(text) |
| 96 | 119 | |
| 120 | + snippet = text[:100] + "..." if len(text) > 100 else text | |
| 121 | + | |
| 97 | 122 | for entity in entities: |
| 98 | - eid = entity.name | |
| 99 | - if eid in self.nodes: | |
| 100 | - self.nodes[eid]["occurrences"].append( | |
| 101 | - { | |
| 102 | - "source": source, | |
| 103 | - "timestamp": timestamp, | |
| 104 | - "text": text[:100] + "..." if len(text) > 100 else text, | |
| 105 | - } | |
| 106 | - ) | |
| 107 | - if entity.descriptions: | |
| 108 | - self.nodes[eid]["descriptions"].update(entity.descriptions) | |
| 109 | - else: | |
| 110 | - self.nodes[eid] = { | |
| 111 | - "id": eid, | |
| 112 | - "name": entity.name, | |
| 113 | - "type": entity.type, | |
| 114 | - "descriptions": set(entity.descriptions), | |
| 115 | - "occurrences": [ | |
| 116 | - { | |
| 117 | - "source": source, | |
| 118 | - "timestamp": timestamp, | |
| 119 | - "text": text[:100] + "..." if len(text) > 100 else text, | |
| 120 | - } | |
| 121 | - ], | |
| 122 | - } | |
| 123 | + self._store.merge_entity(entity.name, entity.type, entity.descriptions, source=source) | |
| 124 | + self._store.add_occurrence(entity.name, source, timestamp, snippet) | |
| 123 | 125 | |
| 124 | 126 | for rel in relationships: |
| 125 | - if rel.source in self.nodes and rel.target in self.nodes: | |
| 126 | - self.relationships.append( | |
| 127 | - { | |
| 128 | - "source": rel.source, | |
| 129 | - "target": rel.target, | |
| 130 | - "type": rel.type, | |
| 131 | - "content_source": source, | |
| 132 | - "timestamp": timestamp, | |
| 133 | - } | |
| 127 | + if self._store.has_entity(rel.source) and self._store.has_entity(rel.target): | |
| 128 | + self._store.add_relationship( | |
| 129 | + rel.source, | |
| 130 | + rel.target, | |
| 131 | + rel.type, | |
| 132 | + content_source=source, | |
| 133 | + timestamp=timestamp, | |
| 134 | 134 | ) |
| 135 | 135 | |
| 136 | 136 | def process_transcript(self, transcript: Dict, batch_size: int = 10) -> None: |
| 137 | 137 | """Process transcript segments into knowledge graph, batching for efficiency.""" |
| 138 | 138 | if "segments" not in transcript: |
| @@ -142,18 +142,12 @@ | ||
| 142 | 142 | segments = transcript["segments"] |
| 143 | 143 | |
| 144 | 144 | # Register speakers first |
| 145 | 145 | for i, segment in enumerate(segments): |
| 146 | 146 | speaker = segment.get("speaker", None) |
| 147 | - if speaker and speaker not in self.nodes: | |
| 148 | - self.nodes[speaker] = { | |
| 149 | - "id": speaker, | |
| 150 | - "name": speaker, | |
| 151 | - "type": "person", | |
| 152 | - "descriptions": {"Speaker in transcript"}, | |
| 153 | - "occurrences": [], | |
| 154 | - } | |
| 147 | + if speaker and not self._store.has_entity(speaker): | |
| 148 | + self._store.merge_entity(speaker, "person", ["Speaker in transcript"]) | |
| 155 | 149 | |
| 156 | 150 | # Batch segments together for fewer API calls |
| 157 | 151 | batches = [] |
| 158 | 152 | for start in range(0, len(segments), batch_size): |
| 159 | 153 | batches.append(segments[start : start + batch_size]) |
| @@ -173,42 +167,36 @@ | ||
| 173 | 167 | |
| 174 | 168 | def process_diagrams(self, diagrams: List[Dict]) -> None: |
| 175 | 169 | """Process diagram results into knowledge graph.""" |
| 176 | 170 | for i, diagram in enumerate(tqdm(diagrams, desc="Processing diagrams for KG", unit="diag")): |
| 177 | 171 | text_content = diagram.get("text_content", "") |
| 172 | + source = f"diagram_{i}" | |
| 178 | 173 | if text_content: |
| 179 | - source = f"diagram_{i}" | |
| 180 | 174 | self.add_content(text_content, source) |
| 181 | 175 | |
| 182 | 176 | diagram_id = f"diagram_{i}" |
| 183 | - if diagram_id not in self.nodes: | |
| 184 | - self.nodes[diagram_id] = { | |
| 185 | - "id": diagram_id, | |
| 186 | - "name": f"Diagram {i}", | |
| 187 | - "type": "diagram", | |
| 188 | - "descriptions": {"Visual diagram from video"}, | |
| 189 | - "occurrences": [ | |
| 190 | - { | |
| 191 | - "source": source if text_content else f"diagram_{i}", | |
| 192 | - "frame_index": diagram.get("frame_index"), | |
| 193 | - } | |
| 194 | - ], | |
| 195 | - } | |
| 177 | + if not self._store.has_entity(diagram_id): | |
| 178 | + self._store.merge_entity(diagram_id, "diagram", ["Visual diagram from video"]) | |
| 179 | + self._store.add_occurrence( | |
| 180 | + diagram_id, | |
| 181 | + source if text_content else diagram_id, | |
| 182 | + text=f"frame_index={diagram.get('frame_index')}", | |
| 183 | + ) | |
| 196 | 184 | |
| 197 | 185 | def to_data(self) -> KnowledgeGraphData: |
| 198 | 186 | """Convert to pydantic KnowledgeGraphData model.""" |
| 199 | 187 | nodes = [] |
| 200 | - for node in self.nodes.values(): | |
| 201 | - descs = node.get("descriptions", set()) | |
| 188 | + for entity in self._store.get_all_entities(): | |
| 189 | + descs = entity.get("descriptions", []) | |
| 202 | 190 | if isinstance(descs, set): |
| 203 | 191 | descs = list(descs) |
| 204 | 192 | nodes.append( |
| 205 | 193 | Entity( |
| 206 | - name=node["name"], | |
| 207 | - type=node.get("type", "concept"), | |
| 194 | + name=entity["name"], | |
| 195 | + type=entity.get("type", "concept"), | |
| 208 | 196 | descriptions=descs, |
| 209 | - occurrences=node.get("occurrences", []), | |
| 197 | + occurrences=entity.get("occurrences", []), | |
| 210 | 198 | ) |
| 211 | 199 | ) |
| 212 | 200 | |
| 213 | 201 | rels = [ |
| 214 | 202 | Relationship( |
| @@ -216,24 +204,17 @@ | ||
| 216 | 204 | target=r["target"], |
| 217 | 205 | type=r.get("type", "related_to"), |
| 218 | 206 | content_source=r.get("content_source"), |
| 219 | 207 | timestamp=r.get("timestamp"), |
| 220 | 208 | ) |
| 221 | - for r in self.relationships | |
| 209 | + for r in self._store.get_all_relationships() | |
| 222 | 210 | ] |
| 223 | 211 | return KnowledgeGraphData(nodes=nodes, relationships=rels) |
| 224 | 212 | |
| 225 | 213 | def to_dict(self) -> Dict: |
| 226 | 214 | """Convert knowledge graph to dictionary (backward-compatible).""" |
| 227 | - nodes_json = [] | |
| 228 | - for node_id, node in self.nodes.items(): | |
| 229 | - node_json = node.copy() | |
| 230 | - descs = node.get("descriptions", set()) | |
| 231 | - node_json["descriptions"] = list(descs) if isinstance(descs, set) else descs | |
| 232 | - nodes_json.append(node_json) | |
| 233 | - | |
| 234 | - return {"nodes": nodes_json, "relationships": self.relationships} | |
| 215 | + return self._store.to_dict() | |
| 235 | 216 | |
| 236 | 217 | def save(self, output_path: Union[str, Path]) -> Path: |
| 237 | 218 | """Save knowledge graph to JSON file.""" |
| 238 | 219 | output_path = Path(output_path) |
| 239 | 220 | if not output_path.suffix: |
| @@ -241,89 +222,96 @@ | ||
| 241 | 222 | output_path.parent.mkdir(parents=True, exist_ok=True) |
| 242 | 223 | |
| 243 | 224 | data = self.to_data() |
| 244 | 225 | output_path.write_text(data.model_dump_json(indent=2)) |
| 245 | 226 | logger.info( |
| 246 | - f"Saved knowledge graph with {len(self.nodes)} nodes " | |
| 247 | - f"and {len(self.relationships)} relationships to {output_path}" | |
| 227 | + f"Saved knowledge graph with {self._store.get_entity_count()} nodes " | |
| 228 | + f"and {self._store.get_relationship_count()} relationships to {output_path}" | |
| 248 | 229 | ) |
| 249 | 230 | return output_path |
| 250 | 231 | |
| 251 | 232 | @classmethod |
| 252 | - def from_dict(cls, data: Dict) -> "KnowledgeGraph": | |
| 233 | + def from_dict(cls, data: Dict, db_path: Optional[Path] = None) -> "KnowledgeGraph": | |
| 253 | 234 | """Reconstruct a KnowledgeGraph from saved JSON dict.""" |
| 254 | - kg = cls() | |
| 235 | + kg = cls(db_path=db_path) | |
| 255 | 236 | for node in data.get("nodes", []): |
| 256 | - nid = node.get("id", node.get("name", "")) | |
| 237 | + name = node.get("name", node.get("id", "")) | |
| 257 | 238 | descs = node.get("descriptions", []) |
| 258 | - kg.nodes[nid] = { | |
| 259 | - "id": nid, | |
| 260 | - "name": node.get("name", nid), | |
| 261 | - "type": node.get("type", "concept"), | |
| 262 | - "descriptions": set(descs) if isinstance(descs, list) else descs, | |
| 263 | - "occurrences": node.get("occurrences", []), | |
| 264 | - } | |
| 265 | - kg.relationships = data.get("relationships", []) | |
| 239 | + if isinstance(descs, set): | |
| 240 | + descs = list(descs) | |
| 241 | + kg._store.merge_entity( | |
| 242 | + name, node.get("type", "concept"), descs, source=node.get("source") | |
| 243 | + ) | |
| 244 | + for occ in node.get("occurrences", []): | |
| 245 | + kg._store.add_occurrence( | |
| 246 | + name, | |
| 247 | + occ.get("source", ""), | |
| 248 | + occ.get("timestamp"), | |
| 249 | + occ.get("text"), | |
| 250 | + ) | |
| 251 | + for rel in data.get("relationships", []): | |
| 252 | + kg._store.add_relationship( | |
| 253 | + rel.get("source", ""), | |
| 254 | + rel.get("target", ""), | |
| 255 | + rel.get("type", "related_to"), | |
| 256 | + content_source=rel.get("content_source"), | |
| 257 | + timestamp=rel.get("timestamp"), | |
| 258 | + ) | |
| 266 | 259 | return kg |
| 267 | 260 | |
| 268 | 261 | def merge(self, other: "KnowledgeGraph") -> None: |
| 269 | 262 | """Merge another KnowledgeGraph into this one.""" |
| 270 | - for nid, node in other.nodes.items(): | |
| 271 | - nid_lower = nid.lower() | |
| 272 | - # Find existing node by case-insensitive match | |
| 273 | - existing_id = None | |
| 274 | - for eid in self.nodes: | |
| 275 | - if eid.lower() == nid_lower: | |
| 276 | - existing_id = eid | |
| 277 | - break | |
| 278 | - | |
| 279 | - if existing_id: | |
| 280 | - existing = self.nodes[existing_id] | |
| 281 | - existing["occurrences"].extend(node.get("occurrences", [])) | |
| 282 | - descs = node.get("descriptions", set()) | |
| 283 | - if isinstance(descs, set): | |
| 284 | - existing["descriptions"].update(descs) | |
| 285 | - elif isinstance(descs, list): | |
| 286 | - existing["descriptions"].update(descs) | |
| 287 | - else: | |
| 288 | - descs = node.get("descriptions", set()) | |
| 289 | - self.nodes[nid] = { | |
| 290 | - "id": nid, | |
| 291 | - "name": node.get("name", nid), | |
| 292 | - "type": node.get("type", "concept"), | |
| 293 | - "descriptions": set(descs) if isinstance(descs, list) else descs, | |
| 294 | - "occurrences": list(node.get("occurrences", [])), | |
| 295 | - } | |
| 296 | - | |
| 297 | - self.relationships.extend(other.relationships) | |
| 263 | + for entity in other._store.get_all_entities(): | |
| 264 | + name = entity["name"] | |
| 265 | + descs = entity.get("descriptions", []) | |
| 266 | + if isinstance(descs, set): | |
| 267 | + descs = list(descs) | |
| 268 | + self._store.merge_entity( | |
| 269 | + name, entity.get("type", "concept"), descs, source=entity.get("source") | |
| 270 | + ) | |
| 271 | + for occ in entity.get("occurrences", []): | |
| 272 | + self._store.add_occurrence( | |
| 273 | + name, | |
| 274 | + occ.get("source", ""), | |
| 275 | + occ.get("timestamp"), | |
| 276 | + occ.get("text"), | |
| 277 | + ) | |
| 278 | + | |
| 279 | + for rel in other._store.get_all_relationships(): | |
| 280 | + self._store.add_relationship( | |
| 281 | + rel.get("source", ""), | |
| 282 | + rel.get("target", ""), | |
| 283 | + rel.get("type", "related_to"), | |
| 284 | + content_source=rel.get("content_source"), | |
| 285 | + timestamp=rel.get("timestamp"), | |
| 286 | + ) | |
| 298 | 287 | |
| 299 | 288 | def generate_mermaid(self, max_nodes: int = 30) -> str: |
| 300 | 289 | """Generate Mermaid visualization code.""" |
| 290 | + nodes = self.nodes | |
| 291 | + rels = self.relationships | |
| 292 | + | |
| 301 | 293 | node_importance = {} |
| 302 | - for node_id in self.nodes: | |
| 303 | - count = sum( | |
| 304 | - 1 | |
| 305 | - for rel in self.relationships | |
| 306 | - if rel["source"] == node_id or rel["target"] == node_id | |
| 307 | - ) | |
| 294 | + for node_id in nodes: | |
| 295 | + count = sum(1 for rel in rels if rel["source"] == node_id or rel["target"] == node_id) | |
| 308 | 296 | node_importance[node_id] = count |
| 309 | 297 | |
| 310 | 298 | important = sorted(node_importance.items(), key=lambda x: x[1], reverse=True) |
| 311 | 299 | important_ids = [n[0] for n in important[:max_nodes]] |
| 312 | 300 | |
| 313 | 301 | mermaid = ["graph LR"] |
| 314 | 302 | |
| 315 | 303 | for nid in important_ids: |
| 316 | - node = self.nodes[nid] | |
| 304 | + node = nodes[nid] | |
| 317 | 305 | ntype = node.get("type", "concept") |
| 318 | 306 | # Sanitize id for mermaid (alphanumeric + underscore only) |
| 319 | 307 | safe_id = "".join(c if c.isalnum() or c == "_" else "_" for c in nid) |
| 320 | 308 | safe_name = node["name"].replace('"', "'") |
| 321 | 309 | mermaid.append(f' {safe_id}["{safe_name}"]:::{ntype}') |
| 322 | 310 | |
| 323 | 311 | added = set() |
| 324 | - for rel in self.relationships: | |
| 312 | + for rel in rels: | |
| 325 | 313 | src, tgt = rel["source"], rel["target"] |
| 326 | 314 | if src in important_ids and tgt in important_ids: |
| 327 | 315 | rtype = rel.get("type", "related_to") |
| 328 | 316 | key = f"{src}|{tgt}|{rtype}" |
| 329 | 317 | if key not in added: |
| 330 | 318 |
| --- video_processor/integrators/knowledge_graph.py | |
| +++ video_processor/integrators/knowledge_graph.py | |
| @@ -4,10 +4,11 @@ | |
| 4 | from pathlib import Path |
| 5 | from typing import Dict, List, Optional, Union |
| 6 | |
| 7 | from tqdm import tqdm |
| 8 | |
| 9 | from video_processor.models import Entity, KnowledgeGraphData, Relationship |
| 10 | from video_processor.providers.manager import ProviderManager |
| 11 | from video_processor.utils.json_parsing import parse_json_from_response |
| 12 | |
| 13 | logger = logging.getLogger(__name__) |
| @@ -17,14 +18,36 @@ | |
| 17 | """Integrates extracted content into a structured knowledge graph.""" |
| 18 | |
| 19 | def __init__( |
| 20 | self, |
| 21 | provider_manager: Optional[ProviderManager] = None, |
| 22 | ): |
| 23 | self.pm = provider_manager |
| 24 | self.nodes: Dict[str, dict] = {} |
| 25 | self.relationships: List[dict] = [] |
| 26 | |
| 27 | def _chat(self, prompt: str, temperature: float = 0.3) -> str: |
| 28 | """Send a chat message through ProviderManager (or return empty if none).""" |
| 29 | if not self.pm: |
| 30 | return "" |
| @@ -92,47 +115,24 @@ | |
| 92 | |
| 93 | def add_content(self, text: str, source: str, timestamp: Optional[float] = None) -> None: |
| 94 | """Add content to knowledge graph by extracting entities and relationships.""" |
| 95 | entities, relationships = self.extract_entities_and_relationships(text) |
| 96 | |
| 97 | for entity in entities: |
| 98 | eid = entity.name |
| 99 | if eid in self.nodes: |
| 100 | self.nodes[eid]["occurrences"].append( |
| 101 | { |
| 102 | "source": source, |
| 103 | "timestamp": timestamp, |
| 104 | "text": text[:100] + "..." if len(text) > 100 else text, |
| 105 | } |
| 106 | ) |
| 107 | if entity.descriptions: |
| 108 | self.nodes[eid]["descriptions"].update(entity.descriptions) |
| 109 | else: |
| 110 | self.nodes[eid] = { |
| 111 | "id": eid, |
| 112 | "name": entity.name, |
| 113 | "type": entity.type, |
| 114 | "descriptions": set(entity.descriptions), |
| 115 | "occurrences": [ |
| 116 | { |
| 117 | "source": source, |
| 118 | "timestamp": timestamp, |
| 119 | "text": text[:100] + "..." if len(text) > 100 else text, |
| 120 | } |
| 121 | ], |
| 122 | } |
| 123 | |
| 124 | for rel in relationships: |
| 125 | if rel.source in self.nodes and rel.target in self.nodes: |
| 126 | self.relationships.append( |
| 127 | { |
| 128 | "source": rel.source, |
| 129 | "target": rel.target, |
| 130 | "type": rel.type, |
| 131 | "content_source": source, |
| 132 | "timestamp": timestamp, |
| 133 | } |
| 134 | ) |
| 135 | |
| 136 | def process_transcript(self, transcript: Dict, batch_size: int = 10) -> None: |
| 137 | """Process transcript segments into knowledge graph, batching for efficiency.""" |
| 138 | if "segments" not in transcript: |
| @@ -142,18 +142,12 @@ | |
| 142 | segments = transcript["segments"] |
| 143 | |
| 144 | # Register speakers first |
| 145 | for i, segment in enumerate(segments): |
| 146 | speaker = segment.get("speaker", None) |
| 147 | if speaker and speaker not in self.nodes: |
| 148 | self.nodes[speaker] = { |
| 149 | "id": speaker, |
| 150 | "name": speaker, |
| 151 | "type": "person", |
| 152 | "descriptions": {"Speaker in transcript"}, |
| 153 | "occurrences": [], |
| 154 | } |
| 155 | |
| 156 | # Batch segments together for fewer API calls |
| 157 | batches = [] |
| 158 | for start in range(0, len(segments), batch_size): |
| 159 | batches.append(segments[start : start + batch_size]) |
| @@ -173,42 +167,36 @@ | |
| 173 | |
| 174 | def process_diagrams(self, diagrams: List[Dict]) -> None: |
| 175 | """Process diagram results into knowledge graph.""" |
| 176 | for i, diagram in enumerate(tqdm(diagrams, desc="Processing diagrams for KG", unit="diag")): |
| 177 | text_content = diagram.get("text_content", "") |
| 178 | if text_content: |
| 179 | source = f"diagram_{i}" |
| 180 | self.add_content(text_content, source) |
| 181 | |
| 182 | diagram_id = f"diagram_{i}" |
| 183 | if diagram_id not in self.nodes: |
| 184 | self.nodes[diagram_id] = { |
| 185 | "id": diagram_id, |
| 186 | "name": f"Diagram {i}", |
| 187 | "type": "diagram", |
| 188 | "descriptions": {"Visual diagram from video"}, |
| 189 | "occurrences": [ |
| 190 | { |
| 191 | "source": source if text_content else f"diagram_{i}", |
| 192 | "frame_index": diagram.get("frame_index"), |
| 193 | } |
| 194 | ], |
| 195 | } |
| 196 | |
| 197 | def to_data(self) -> KnowledgeGraphData: |
| 198 | """Convert to pydantic KnowledgeGraphData model.""" |
| 199 | nodes = [] |
| 200 | for node in self.nodes.values(): |
| 201 | descs = node.get("descriptions", set()) |
| 202 | if isinstance(descs, set): |
| 203 | descs = list(descs) |
| 204 | nodes.append( |
| 205 | Entity( |
| 206 | name=node["name"], |
| 207 | type=node.get("type", "concept"), |
| 208 | descriptions=descs, |
| 209 | occurrences=node.get("occurrences", []), |
| 210 | ) |
| 211 | ) |
| 212 | |
| 213 | rels = [ |
| 214 | Relationship( |
| @@ -216,24 +204,17 @@ | |
| 216 | target=r["target"], |
| 217 | type=r.get("type", "related_to"), |
| 218 | content_source=r.get("content_source"), |
| 219 | timestamp=r.get("timestamp"), |
| 220 | ) |
| 221 | for r in self.relationships |
| 222 | ] |
| 223 | return KnowledgeGraphData(nodes=nodes, relationships=rels) |
| 224 | |
| 225 | def to_dict(self) -> Dict: |
| 226 | """Convert knowledge graph to dictionary (backward-compatible).""" |
| 227 | nodes_json = [] |
| 228 | for node_id, node in self.nodes.items(): |
| 229 | node_json = node.copy() |
| 230 | descs = node.get("descriptions", set()) |
| 231 | node_json["descriptions"] = list(descs) if isinstance(descs, set) else descs |
| 232 | nodes_json.append(node_json) |
| 233 | |
| 234 | return {"nodes": nodes_json, "relationships": self.relationships} |
| 235 | |
| 236 | def save(self, output_path: Union[str, Path]) -> Path: |
| 237 | """Save knowledge graph to JSON file.""" |
| 238 | output_path = Path(output_path) |
| 239 | if not output_path.suffix: |
| @@ -241,89 +222,96 @@ | |
| 241 | output_path.parent.mkdir(parents=True, exist_ok=True) |
| 242 | |
| 243 | data = self.to_data() |
| 244 | output_path.write_text(data.model_dump_json(indent=2)) |
| 245 | logger.info( |
| 246 | f"Saved knowledge graph with {len(self.nodes)} nodes " |
| 247 | f"and {len(self.relationships)} relationships to {output_path}" |
| 248 | ) |
| 249 | return output_path |
| 250 | |
| 251 | @classmethod |
| 252 | def from_dict(cls, data: Dict) -> "KnowledgeGraph": |
| 253 | """Reconstruct a KnowledgeGraph from saved JSON dict.""" |
| 254 | kg = cls() |
| 255 | for node in data.get("nodes", []): |
| 256 | nid = node.get("id", node.get("name", "")) |
| 257 | descs = node.get("descriptions", []) |
| 258 | kg.nodes[nid] = { |
| 259 | "id": nid, |
| 260 | "name": node.get("name", nid), |
| 261 | "type": node.get("type", "concept"), |
| 262 | "descriptions": set(descs) if isinstance(descs, list) else descs, |
| 263 | "occurrences": node.get("occurrences", []), |
| 264 | } |
| 265 | kg.relationships = data.get("relationships", []) |
| 266 | return kg |
| 267 | |
| 268 | def merge(self, other: "KnowledgeGraph") -> None: |
| 269 | """Merge another KnowledgeGraph into this one.""" |
| 270 | for nid, node in other.nodes.items(): |
| 271 | nid_lower = nid.lower() |
| 272 | # Find existing node by case-insensitive match |
| 273 | existing_id = None |
| 274 | for eid in self.nodes: |
| 275 | if eid.lower() == nid_lower: |
| 276 | existing_id = eid |
| 277 | break |
| 278 | |
| 279 | if existing_id: |
| 280 | existing = self.nodes[existing_id] |
| 281 | existing["occurrences"].extend(node.get("occurrences", [])) |
| 282 | descs = node.get("descriptions", set()) |
| 283 | if isinstance(descs, set): |
| 284 | existing["descriptions"].update(descs) |
| 285 | elif isinstance(descs, list): |
| 286 | existing["descriptions"].update(descs) |
| 287 | else: |
| 288 | descs = node.get("descriptions", set()) |
| 289 | self.nodes[nid] = { |
| 290 | "id": nid, |
| 291 | "name": node.get("name", nid), |
| 292 | "type": node.get("type", "concept"), |
| 293 | "descriptions": set(descs) if isinstance(descs, list) else descs, |
| 294 | "occurrences": list(node.get("occurrences", [])), |
| 295 | } |
| 296 | |
| 297 | self.relationships.extend(other.relationships) |
| 298 | |
| 299 | def generate_mermaid(self, max_nodes: int = 30) -> str: |
| 300 | """Generate Mermaid visualization code.""" |
| 301 | node_importance = {} |
| 302 | for node_id in self.nodes: |
| 303 | count = sum( |
| 304 | 1 |
| 305 | for rel in self.relationships |
| 306 | if rel["source"] == node_id or rel["target"] == node_id |
| 307 | ) |
| 308 | node_importance[node_id] = count |
| 309 | |
| 310 | important = sorted(node_importance.items(), key=lambda x: x[1], reverse=True) |
| 311 | important_ids = [n[0] for n in important[:max_nodes]] |
| 312 | |
| 313 | mermaid = ["graph LR"] |
| 314 | |
| 315 | for nid in important_ids: |
| 316 | node = self.nodes[nid] |
| 317 | ntype = node.get("type", "concept") |
| 318 | # Sanitize id for mermaid (alphanumeric + underscore only) |
| 319 | safe_id = "".join(c if c.isalnum() or c == "_" else "_" for c in nid) |
| 320 | safe_name = node["name"].replace('"', "'") |
| 321 | mermaid.append(f' {safe_id}["{safe_name}"]:::{ntype}') |
| 322 | |
| 323 | added = set() |
| 324 | for rel in self.relationships: |
| 325 | src, tgt = rel["source"], rel["target"] |
| 326 | if src in important_ids and tgt in important_ids: |
| 327 | rtype = rel.get("type", "related_to") |
| 328 | key = f"{src}|{tgt}|{rtype}" |
| 329 | if key not in added: |
| 330 |
| --- video_processor/integrators/knowledge_graph.py | |
| +++ video_processor/integrators/knowledge_graph.py | |
| @@ -4,10 +4,11 @@ | |
| 4 | from pathlib import Path |
| 5 | from typing import Dict, List, Optional, Union |
| 6 | |
| 7 | from tqdm import tqdm |
| 8 | |
| 9 | from video_processor.integrators.graph_store import GraphStore, create_store |
| 10 | from video_processor.models import Entity, KnowledgeGraphData, Relationship |
| 11 | from video_processor.providers.manager import ProviderManager |
| 12 | from video_processor.utils.json_parsing import parse_json_from_response |
| 13 | |
| 14 | logger = logging.getLogger(__name__) |
| @@ -17,14 +18,36 @@ | |
| 18 | """Integrates extracted content into a structured knowledge graph.""" |
| 19 | |
| 20 | def __init__( |
| 21 | self, |
| 22 | provider_manager: Optional[ProviderManager] = None, |
| 23 | db_path: Optional[Path] = None, |
| 24 | store: Optional[GraphStore] = None, |
| 25 | ): |
| 26 | self.pm = provider_manager |
| 27 | self._store = store or create_store(db_path) |
| 28 | |
| 29 | @property |
| 30 | def nodes(self) -> Dict[str, dict]: |
| 31 | """Backward-compatible read access to nodes as a dict keyed by entity name.""" |
| 32 | result = {} |
| 33 | for entity in self._store.get_all_entities(): |
| 34 | name = entity["name"] |
| 35 | descs = entity.get("descriptions", []) |
| 36 | result[name] = { |
| 37 | "id": entity.get("id", name), |
| 38 | "name": name, |
| 39 | "type": entity.get("type", "concept"), |
| 40 | "descriptions": set(descs) if isinstance(descs, list) else descs, |
| 41 | "occurrences": entity.get("occurrences", []), |
| 42 | } |
| 43 | return result |
| 44 | |
| 45 | @property |
| 46 | def relationships(self) -> List[dict]: |
| 47 | """Backward-compatible read access to relationships.""" |
| 48 | return self._store.get_all_relationships() |
| 49 | |
| 50 | def _chat(self, prompt: str, temperature: float = 0.3) -> str: |
| 51 | """Send a chat message through ProviderManager (or return empty if none).""" |
| 52 | if not self.pm: |
| 53 | return "" |
| @@ -92,47 +115,24 @@ | |
| 115 | |
| 116 | def add_content(self, text: str, source: str, timestamp: Optional[float] = None) -> None: |
| 117 | """Add content to knowledge graph by extracting entities and relationships.""" |
| 118 | entities, relationships = self.extract_entities_and_relationships(text) |
| 119 | |
| 120 | snippet = text[:100] + "..." if len(text) > 100 else text |
| 121 | |
| 122 | for entity in entities: |
| 123 | self._store.merge_entity(entity.name, entity.type, entity.descriptions, source=source) |
| 124 | self._store.add_occurrence(entity.name, source, timestamp, snippet) |
| 125 | |
| 126 | for rel in relationships: |
| 127 | if self._store.has_entity(rel.source) and self._store.has_entity(rel.target): |
| 128 | self._store.add_relationship( |
| 129 | rel.source, |
| 130 | rel.target, |
| 131 | rel.type, |
| 132 | content_source=source, |
| 133 | timestamp=timestamp, |
| 134 | ) |
| 135 | |
| 136 | def process_transcript(self, transcript: Dict, batch_size: int = 10) -> None: |
| 137 | """Process transcript segments into knowledge graph, batching for efficiency.""" |
| 138 | if "segments" not in transcript: |
| @@ -142,18 +142,12 @@ | |
| 142 | segments = transcript["segments"] |
| 143 | |
| 144 | # Register speakers first |
| 145 | for i, segment in enumerate(segments): |
| 146 | speaker = segment.get("speaker", None) |
| 147 | if speaker and not self._store.has_entity(speaker): |
| 148 | self._store.merge_entity(speaker, "person", ["Speaker in transcript"]) |
| 149 | |
| 150 | # Batch segments together for fewer API calls |
| 151 | batches = [] |
| 152 | for start in range(0, len(segments), batch_size): |
| 153 | batches.append(segments[start : start + batch_size]) |
| @@ -173,42 +167,36 @@ | |
| 167 | |
| 168 | def process_diagrams(self, diagrams: List[Dict]) -> None: |
| 169 | """Process diagram results into knowledge graph.""" |
| 170 | for i, diagram in enumerate(tqdm(diagrams, desc="Processing diagrams for KG", unit="diag")): |
| 171 | text_content = diagram.get("text_content", "") |
| 172 | source = f"diagram_{i}" |
| 173 | if text_content: |
| 174 | self.add_content(text_content, source) |
| 175 | |
| 176 | diagram_id = f"diagram_{i}" |
| 177 | if not self._store.has_entity(diagram_id): |
| 178 | self._store.merge_entity(diagram_id, "diagram", ["Visual diagram from video"]) |
| 179 | self._store.add_occurrence( |
| 180 | diagram_id, |
| 181 | source if text_content else diagram_id, |
| 182 | text=f"frame_index={diagram.get('frame_index')}", |
| 183 | ) |
| 184 | |
| 185 | def to_data(self) -> KnowledgeGraphData: |
| 186 | """Convert to pydantic KnowledgeGraphData model.""" |
| 187 | nodes = [] |
| 188 | for entity in self._store.get_all_entities(): |
| 189 | descs = entity.get("descriptions", []) |
| 190 | if isinstance(descs, set): |
| 191 | descs = list(descs) |
| 192 | nodes.append( |
| 193 | Entity( |
| 194 | name=entity["name"], |
| 195 | type=entity.get("type", "concept"), |
| 196 | descriptions=descs, |
| 197 | occurrences=entity.get("occurrences", []), |
| 198 | ) |
| 199 | ) |
| 200 | |
| 201 | rels = [ |
| 202 | Relationship( |
| @@ -216,24 +204,17 @@ | |
| 204 | target=r["target"], |
| 205 | type=r.get("type", "related_to"), |
| 206 | content_source=r.get("content_source"), |
| 207 | timestamp=r.get("timestamp"), |
| 208 | ) |
| 209 | for r in self._store.get_all_relationships() |
| 210 | ] |
| 211 | return KnowledgeGraphData(nodes=nodes, relationships=rels) |
| 212 | |
| 213 | def to_dict(self) -> Dict: |
| 214 | """Convert knowledge graph to dictionary (backward-compatible).""" |
| 215 | return self._store.to_dict() |
| 216 | |
| 217 | def save(self, output_path: Union[str, Path]) -> Path: |
| 218 | """Save knowledge graph to JSON file.""" |
| 219 | output_path = Path(output_path) |
| 220 | if not output_path.suffix: |
| @@ -241,89 +222,96 @@ | |
| 222 | output_path.parent.mkdir(parents=True, exist_ok=True) |
| 223 | |
| 224 | data = self.to_data() |
| 225 | output_path.write_text(data.model_dump_json(indent=2)) |
| 226 | logger.info( |
| 227 | f"Saved knowledge graph with {self._store.get_entity_count()} nodes " |
| 228 | f"and {self._store.get_relationship_count()} relationships to {output_path}" |
| 229 | ) |
| 230 | return output_path |
| 231 | |
| 232 | @classmethod |
| 233 | def from_dict(cls, data: Dict, db_path: Optional[Path] = None) -> "KnowledgeGraph": |
| 234 | """Reconstruct a KnowledgeGraph from saved JSON dict.""" |
| 235 | kg = cls(db_path=db_path) |
| 236 | for node in data.get("nodes", []): |
| 237 | name = node.get("name", node.get("id", "")) |
| 238 | descs = node.get("descriptions", []) |
| 239 | if isinstance(descs, set): |
| 240 | descs = list(descs) |
| 241 | kg._store.merge_entity( |
| 242 | name, node.get("type", "concept"), descs, source=node.get("source") |
| 243 | ) |
| 244 | for occ in node.get("occurrences", []): |
| 245 | kg._store.add_occurrence( |
| 246 | name, |
| 247 | occ.get("source", ""), |
| 248 | occ.get("timestamp"), |
| 249 | occ.get("text"), |
| 250 | ) |
| 251 | for rel in data.get("relationships", []): |
| 252 | kg._store.add_relationship( |
| 253 | rel.get("source", ""), |
| 254 | rel.get("target", ""), |
| 255 | rel.get("type", "related_to"), |
| 256 | content_source=rel.get("content_source"), |
| 257 | timestamp=rel.get("timestamp"), |
| 258 | ) |
| 259 | return kg |
| 260 | |
| 261 | def merge(self, other: "KnowledgeGraph") -> None: |
| 262 | """Merge another KnowledgeGraph into this one.""" |
| 263 | for entity in other._store.get_all_entities(): |
| 264 | name = entity["name"] |
| 265 | descs = entity.get("descriptions", []) |
| 266 | if isinstance(descs, set): |
| 267 | descs = list(descs) |
| 268 | self._store.merge_entity( |
| 269 | name, entity.get("type", "concept"), descs, source=entity.get("source") |
| 270 | ) |
| 271 | for occ in entity.get("occurrences", []): |
| 272 | self._store.add_occurrence( |
| 273 | name, |
| 274 | occ.get("source", ""), |
| 275 | occ.get("timestamp"), |
| 276 | occ.get("text"), |
| 277 | ) |
| 278 | |
| 279 | for rel in other._store.get_all_relationships(): |
| 280 | self._store.add_relationship( |
| 281 | rel.get("source", ""), |
| 282 | rel.get("target", ""), |
| 283 | rel.get("type", "related_to"), |
| 284 | content_source=rel.get("content_source"), |
| 285 | timestamp=rel.get("timestamp"), |
| 286 | ) |
| 287 | |
| 288 | def generate_mermaid(self, max_nodes: int = 30) -> str: |
| 289 | """Generate Mermaid visualization code.""" |
| 290 | nodes = self.nodes |
| 291 | rels = self.relationships |
| 292 | |
| 293 | node_importance = {} |
| 294 | for node_id in nodes: |
| 295 | count = sum(1 for rel in rels if rel["source"] == node_id or rel["target"] == node_id) |
| 296 | node_importance[node_id] = count |
| 297 | |
| 298 | important = sorted(node_importance.items(), key=lambda x: x[1], reverse=True) |
| 299 | important_ids = [n[0] for n in important[:max_nodes]] |
| 300 | |
| 301 | mermaid = ["graph LR"] |
| 302 | |
| 303 | for nid in important_ids: |
| 304 | node = nodes[nid] |
| 305 | ntype = node.get("type", "concept") |
| 306 | # Sanitize id for mermaid (alphanumeric + underscore only) |
| 307 | safe_id = "".join(c if c.isalnum() or c == "_" else "_" for c in nid) |
| 308 | safe_name = node["name"].replace('"', "'") |
| 309 | mermaid.append(f' {safe_id}["{safe_name}"]:::{ntype}') |
| 310 | |
| 311 | added = set() |
| 312 | for rel in rels: |
| 313 | src, tgt = rel["source"], rel["target"] |
| 314 | if src in important_ids and tgt in important_ids: |
| 315 | rtype = rel.get("type", "related_to") |
| 316 | key = f"{src}|{tgt}|{rtype}" |
| 317 | if key not in added: |
| 318 |
| --- video_processor/models.py | ||
| +++ video_processor/models.py | ||
| @@ -176,10 +176,11 @@ | ||
| 176 | 176 | transcript_srt: Optional[str] = Field(default=None) |
| 177 | 177 | analysis_md: Optional[str] = Field(default=None) |
| 178 | 178 | analysis_html: Optional[str] = Field(default=None) |
| 179 | 179 | analysis_pdf: Optional[str] = Field(default=None) |
| 180 | 180 | knowledge_graph_json: Optional[str] = Field(default=None) |
| 181 | + knowledge_graph_db: Optional[str] = Field(default=None) | |
| 181 | 182 | key_points_json: Optional[str] = Field(default=None) |
| 182 | 183 | action_items_json: Optional[str] = Field(default=None) |
| 183 | 184 | |
| 184 | 185 | # Inline structured data |
| 185 | 186 | key_points: List[KeyPoint] = Field(default_factory=list) |
| @@ -225,5 +226,6 @@ | ||
| 225 | 226 | total_key_points: int = Field(default=0) |
| 226 | 227 | |
| 227 | 228 | # Batch-level output paths (relative) |
| 228 | 229 | batch_summary_md: Optional[str] = Field(default=None) |
| 229 | 230 | merged_knowledge_graph_json: Optional[str] = Field(default=None) |
| 231 | + merged_knowledge_graph_db: Optional[str] = Field(default=None) | |
| 230 | 232 |
| --- video_processor/models.py | |
| +++ video_processor/models.py | |
| @@ -176,10 +176,11 @@ | |
| 176 | transcript_srt: Optional[str] = Field(default=None) |
| 177 | analysis_md: Optional[str] = Field(default=None) |
| 178 | analysis_html: Optional[str] = Field(default=None) |
| 179 | analysis_pdf: Optional[str] = Field(default=None) |
| 180 | knowledge_graph_json: Optional[str] = Field(default=None) |
| 181 | key_points_json: Optional[str] = Field(default=None) |
| 182 | action_items_json: Optional[str] = Field(default=None) |
| 183 | |
| 184 | # Inline structured data |
| 185 | key_points: List[KeyPoint] = Field(default_factory=list) |
| @@ -225,5 +226,6 @@ | |
| 225 | total_key_points: int = Field(default=0) |
| 226 | |
| 227 | # Batch-level output paths (relative) |
| 228 | batch_summary_md: Optional[str] = Field(default=None) |
| 229 | merged_knowledge_graph_json: Optional[str] = Field(default=None) |
| 230 |
| --- video_processor/models.py | |
| +++ video_processor/models.py | |
| @@ -176,10 +176,11 @@ | |
| 176 | transcript_srt: Optional[str] = Field(default=None) |
| 177 | analysis_md: Optional[str] = Field(default=None) |
| 178 | analysis_html: Optional[str] = Field(default=None) |
| 179 | analysis_pdf: Optional[str] = Field(default=None) |
| 180 | knowledge_graph_json: Optional[str] = Field(default=None) |
| 181 | knowledge_graph_db: Optional[str] = Field(default=None) |
| 182 | key_points_json: Optional[str] = Field(default=None) |
| 183 | action_items_json: Optional[str] = Field(default=None) |
| 184 | |
| 185 | # Inline structured data |
| 186 | key_points: List[KeyPoint] = Field(default_factory=list) |
| @@ -225,5 +226,6 @@ | |
| 226 | total_key_points: int = Field(default=0) |
| 227 | |
| 228 | # Batch-level output paths (relative) |
| 229 | batch_summary_md: Optional[str] = Field(default=None) |
| 230 | merged_knowledge_graph_json: Optional[str] = Field(default=None) |
| 231 | merged_knowledge_graph_db: Optional[str] = Field(default=None) |
| 232 |
| --- video_processor/output_structure.py | ||
| +++ video_processor/output_structure.py | ||
| @@ -25,10 +25,11 @@ | ||
| 25 | 25 | captures/ |
| 26 | 26 | capture_0.jpg, capture_0.json |
| 27 | 27 | results/ |
| 28 | 28 | analysis.md, .html, .pdf |
| 29 | 29 | knowledge_graph.json |
| 30 | + knowledge_graph.db (when falkordblite installed) | |
| 30 | 31 | key_points.json |
| 31 | 32 | action_items.json |
| 32 | 33 | cache/ |
| 33 | 34 | |
| 34 | 35 | Returns dict mapping directory names to Path objects. |
| @@ -56,10 +57,11 @@ | ||
| 56 | 57 | Layout: |
| 57 | 58 | output_dir/ |
| 58 | 59 | manifest.json |
| 59 | 60 | batch_summary.md |
| 60 | 61 | knowledge_graph.json |
| 62 | + knowledge_graph.db (when falkordblite installed) | |
| 61 | 63 | videos/ |
| 62 | 64 | video_1/manifest.json |
| 63 | 65 | video_2/manifest.json |
| 64 | 66 | ... |
| 65 | 67 | |
| 66 | 68 |
| --- video_processor/output_structure.py | |
| +++ video_processor/output_structure.py | |
| @@ -25,10 +25,11 @@ | |
| 25 | captures/ |
| 26 | capture_0.jpg, capture_0.json |
| 27 | results/ |
| 28 | analysis.md, .html, .pdf |
| 29 | knowledge_graph.json |
| 30 | key_points.json |
| 31 | action_items.json |
| 32 | cache/ |
| 33 | |
| 34 | Returns dict mapping directory names to Path objects. |
| @@ -56,10 +57,11 @@ | |
| 56 | Layout: |
| 57 | output_dir/ |
| 58 | manifest.json |
| 59 | batch_summary.md |
| 60 | knowledge_graph.json |
| 61 | videos/ |
| 62 | video_1/manifest.json |
| 63 | video_2/manifest.json |
| 64 | ... |
| 65 | |
| 66 |
| --- video_processor/output_structure.py | |
| +++ video_processor/output_structure.py | |
| @@ -25,10 +25,11 @@ | |
| 25 | captures/ |
| 26 | capture_0.jpg, capture_0.json |
| 27 | results/ |
| 28 | analysis.md, .html, .pdf |
| 29 | knowledge_graph.json |
| 30 | knowledge_graph.db (when falkordblite installed) |
| 31 | key_points.json |
| 32 | action_items.json |
| 33 | cache/ |
| 34 | |
| 35 | Returns dict mapping directory names to Path objects. |
| @@ -56,10 +57,11 @@ | |
| 57 | Layout: |
| 58 | output_dir/ |
| 59 | manifest.json |
| 60 | batch_summary.md |
| 61 | knowledge_graph.json |
| 62 | knowledge_graph.db (when falkordblite installed) |
| 63 | videos/ |
| 64 | video_1/manifest.json |
| 65 | video_2/manifest.json |
| 66 | ... |
| 67 | |
| 68 |
+4
-3
| --- video_processor/pipeline.py | ||
| +++ video_processor/pipeline.py | ||
| @@ -191,18 +191,18 @@ | ||
| 191 | 191 | |
| 192 | 192 | # --- Step 5: Knowledge graph --- |
| 193 | 193 | pm.usage.start_step("Knowledge graph") |
| 194 | 194 | pipeline_bar.set_description("Pipeline: building knowledge graph") |
| 195 | 195 | kg_json_path = dirs["results"] / "knowledge_graph.json" |
| 196 | + kg_db_path = dirs["results"] / "knowledge_graph.db" | |
| 196 | 197 | if kg_json_path.exists(): |
| 197 | 198 | logger.info("Resuming: found knowledge graph on disk, loading") |
| 198 | 199 | kg_data = json.loads(kg_json_path.read_text()) |
| 199 | - kg = KnowledgeGraph(provider_manager=pm) | |
| 200 | - kg = KnowledgeGraph.from_dict(kg_data) | |
| 200 | + kg = KnowledgeGraph.from_dict(kg_data, db_path=kg_db_path) | |
| 201 | 201 | else: |
| 202 | 202 | logger.info("Building knowledge graph...") |
| 203 | - kg = KnowledgeGraph(provider_manager=pm) | |
| 203 | + kg = KnowledgeGraph(provider_manager=pm, db_path=kg_db_path) | |
| 204 | 204 | kg.process_transcript(transcript_data) |
| 205 | 205 | if diagrams: |
| 206 | 206 | diagram_dicts = [d.model_dump() for d in diagrams] |
| 207 | 207 | kg.process_diagrams(diagram_dicts) |
| 208 | 208 | kg.save(kg_json_path) |
| @@ -265,10 +265,11 @@ | ||
| 265 | 265 | transcript_json="transcript/transcript.json", |
| 266 | 266 | transcript_txt="transcript/transcript.txt", |
| 267 | 267 | transcript_srt="transcript/transcript.srt", |
| 268 | 268 | analysis_md="results/analysis.md", |
| 269 | 269 | knowledge_graph_json="results/knowledge_graph.json", |
| 270 | + knowledge_graph_db="results/knowledge_graph.db", | |
| 270 | 271 | key_points_json="results/key_points.json", |
| 271 | 272 | action_items_json="results/action_items.json", |
| 272 | 273 | key_points=key_points, |
| 273 | 274 | action_items=action_items, |
| 274 | 275 | diagrams=diagrams, |
| 275 | 276 |
| --- video_processor/pipeline.py | |
| +++ video_processor/pipeline.py | |
| @@ -191,18 +191,18 @@ | |
| 191 | |
| 192 | # --- Step 5: Knowledge graph --- |
| 193 | pm.usage.start_step("Knowledge graph") |
| 194 | pipeline_bar.set_description("Pipeline: building knowledge graph") |
| 195 | kg_json_path = dirs["results"] / "knowledge_graph.json" |
| 196 | if kg_json_path.exists(): |
| 197 | logger.info("Resuming: found knowledge graph on disk, loading") |
| 198 | kg_data = json.loads(kg_json_path.read_text()) |
| 199 | kg = KnowledgeGraph(provider_manager=pm) |
| 200 | kg = KnowledgeGraph.from_dict(kg_data) |
| 201 | else: |
| 202 | logger.info("Building knowledge graph...") |
| 203 | kg = KnowledgeGraph(provider_manager=pm) |
| 204 | kg.process_transcript(transcript_data) |
| 205 | if diagrams: |
| 206 | diagram_dicts = [d.model_dump() for d in diagrams] |
| 207 | kg.process_diagrams(diagram_dicts) |
| 208 | kg.save(kg_json_path) |
| @@ -265,10 +265,11 @@ | |
| 265 | transcript_json="transcript/transcript.json", |
| 266 | transcript_txt="transcript/transcript.txt", |
| 267 | transcript_srt="transcript/transcript.srt", |
| 268 | analysis_md="results/analysis.md", |
| 269 | knowledge_graph_json="results/knowledge_graph.json", |
| 270 | key_points_json="results/key_points.json", |
| 271 | action_items_json="results/action_items.json", |
| 272 | key_points=key_points, |
| 273 | action_items=action_items, |
| 274 | diagrams=diagrams, |
| 275 |
| --- video_processor/pipeline.py | |
| +++ video_processor/pipeline.py | |
| @@ -191,18 +191,18 @@ | |
| 191 | |
| 192 | # --- Step 5: Knowledge graph --- |
| 193 | pm.usage.start_step("Knowledge graph") |
| 194 | pipeline_bar.set_description("Pipeline: building knowledge graph") |
| 195 | kg_json_path = dirs["results"] / "knowledge_graph.json" |
| 196 | kg_db_path = dirs["results"] / "knowledge_graph.db" |
| 197 | if kg_json_path.exists(): |
| 198 | logger.info("Resuming: found knowledge graph on disk, loading") |
| 199 | kg_data = json.loads(kg_json_path.read_text()) |
| 200 | kg = KnowledgeGraph.from_dict(kg_data, db_path=kg_db_path) |
| 201 | else: |
| 202 | logger.info("Building knowledge graph...") |
| 203 | kg = KnowledgeGraph(provider_manager=pm, db_path=kg_db_path) |
| 204 | kg.process_transcript(transcript_data) |
| 205 | if diagrams: |
| 206 | diagram_dicts = [d.model_dump() for d in diagrams] |
| 207 | kg.process_diagrams(diagram_dicts) |
| 208 | kg.save(kg_json_path) |
| @@ -265,10 +265,11 @@ | |
| 265 | transcript_json="transcript/transcript.json", |
| 266 | transcript_txt="transcript/transcript.txt", |
| 267 | transcript_srt="transcript/transcript.srt", |
| 268 | analysis_md="results/analysis.md", |
| 269 | knowledge_graph_json="results/knowledge_graph.json", |
| 270 | knowledge_graph_db="results/knowledge_graph.db", |
| 271 | key_points_json="results/key_points.json", |
| 272 | action_items_json="results/action_items.json", |
| 273 | key_points=key_points, |
| 274 | action_items=action_items, |
| 275 | diagrams=diagrams, |
| 276 |