|
0981a08…
|
noreply
|
1 |
"""Google Keep source connector using the gws CLI (googleworkspace/cli). |
|
0981a08…
|
noreply
|
2 |
|
|
0981a08…
|
noreply
|
3 |
Fetches notes from Google Keep via the `gws` CLI tool. |
|
0981a08…
|
noreply
|
4 |
Outputs plain text suitable for KG ingestion. |
|
0981a08…
|
noreply
|
5 |
|
|
0981a08…
|
noreply
|
6 |
Requires: npm install -g @googleworkspace/cli |
|
0981a08…
|
noreply
|
7 |
Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless) |
|
0981a08…
|
noreply
|
8 |
""" |
|
0981a08…
|
noreply
|
9 |
|
|
0981a08…
|
noreply
|
10 |
import json |
|
0981a08…
|
noreply
|
11 |
import logging |
|
0981a08…
|
noreply
|
12 |
import shutil |
|
0981a08…
|
noreply
|
13 |
import subprocess |
|
0981a08…
|
noreply
|
14 |
from pathlib import Path |
|
0981a08…
|
noreply
|
15 |
from typing import Any, Dict, List, Optional |
|
0981a08…
|
noreply
|
16 |
|
|
0981a08…
|
noreply
|
17 |
from video_processor.sources.base import BaseSource, SourceFile |
|
0981a08…
|
noreply
|
18 |
|
|
0981a08…
|
noreply
|
19 |
logger = logging.getLogger(__name__) |
|
0981a08…
|
noreply
|
20 |
|
|
0981a08…
|
noreply
|
21 |
|
|
0981a08…
|
noreply
|
22 |
def _run_gws(args: List[str], timeout: int = 30) -> Dict[str, Any]: |
|
0981a08…
|
noreply
|
23 |
"""Run a gws CLI command and return parsed JSON output.""" |
|
0981a08…
|
noreply
|
24 |
cmd = ["gws"] + args |
|
0981a08…
|
noreply
|
25 |
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) |
|
0981a08…
|
noreply
|
26 |
if proc.returncode != 0: |
|
0981a08…
|
noreply
|
27 |
raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}") |
|
0981a08…
|
noreply
|
28 |
try: |
|
0981a08…
|
noreply
|
29 |
return json.loads(proc.stdout) |
|
0981a08…
|
noreply
|
30 |
except json.JSONDecodeError: |
|
0981a08…
|
noreply
|
31 |
return {"raw": proc.stdout.strip()} |
|
0981a08…
|
noreply
|
32 |
|
|
0981a08…
|
noreply
|
33 |
|
|
0981a08…
|
noreply
|
34 |
def _note_to_text(note: dict) -> str: |
|
0981a08…
|
noreply
|
35 |
"""Extract text content from a Google Keep note structure. |
|
0981a08…
|
noreply
|
36 |
|
|
0981a08…
|
noreply
|
37 |
Handles plain text notes and checklists. Checklist items are formatted |
|
0981a08…
|
noreply
|
38 |
as ``- [x] item`` (checked) or ``- [ ] item`` (unchecked). |
|
0981a08…
|
noreply
|
39 |
""" |
|
0981a08…
|
noreply
|
40 |
parts: List[str] = [] |
|
0981a08…
|
noreply
|
41 |
|
|
0981a08…
|
noreply
|
42 |
title = note.get("title", "").strip() |
|
0981a08…
|
noreply
|
43 |
if title: |
|
0981a08…
|
noreply
|
44 |
parts.append(title) |
|
0981a08…
|
noreply
|
45 |
|
|
0981a08…
|
noreply
|
46 |
body = note.get("body", note.get("textContent", "")).strip() |
|
0981a08…
|
noreply
|
47 |
if body: |
|
0981a08…
|
noreply
|
48 |
parts.append(body) |
|
0981a08…
|
noreply
|
49 |
|
|
0981a08…
|
noreply
|
50 |
# Checklist items may appear under "list", "listContent", or "checklistItems" |
|
0981a08…
|
noreply
|
51 |
list_items = note.get("list", note.get("listContent", note.get("checklistItems", []))) |
|
0981a08…
|
noreply
|
52 |
if isinstance(list_items, list): |
|
0981a08…
|
noreply
|
53 |
for item in list_items: |
|
0981a08…
|
noreply
|
54 |
text = item.get("text", "").strip() |
|
0981a08…
|
noreply
|
55 |
if not text: |
|
0981a08…
|
noreply
|
56 |
continue |
|
0981a08…
|
noreply
|
57 |
checked = item.get("checked", item.get("isChecked", False)) |
|
0981a08…
|
noreply
|
58 |
marker = "[x]" if checked else "[ ]" |
|
0981a08…
|
noreply
|
59 |
parts.append(f"- {marker} {text}") |
|
0981a08…
|
noreply
|
60 |
|
|
0981a08…
|
noreply
|
61 |
return "\n\n".join(parts) if parts else "" |
|
0981a08…
|
noreply
|
62 |
|
|
0981a08…
|
noreply
|
63 |
|
|
0981a08…
|
noreply
|
64 |
class GoogleKeepSource(BaseSource):
    """
    Fetch notes from Google Keep via the gws CLI.

    Usage:
        source = GoogleKeepSource()                          # all notes
        source = GoogleKeepSource(label="meetings")          # filter by label
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(self, label: Optional[str] = None):
        """Store the optional label that ``list_videos`` filters by."""
        self.label = label

    def authenticate(self) -> bool:
        """Check if gws CLI is installed and authenticated.

        Returns:
            ``True`` when ``gws`` is on ``PATH`` and ``gws auth status``
            succeeds; ``False`` otherwise (the reason is logged).
        """
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes in Google Keep. Returns SourceFile per note.

        ``folder_id``, ``folder_path`` and ``patterns`` are accepted but not
        used by this implementation; only ``self.label`` narrows the listing.

        Returns:
            One ``SourceFile`` per note. An empty list is returned (and the
            error logged) when the ``gws`` command fails, cannot be started,
            or times out.
        """
        args = ["keep", "notes", "list", "--output", "json"]
        if self.label:
            args.extend(["--label", self.label])

        try:
            result = _run_gws(args, timeout=60)
        except (RuntimeError, subprocess.TimeoutExpired, OSError) as e:
            # Listing is best-effort: a timeout or a missing binary is
            # reported the same way as a failing command.
            logger.error("Failed to list Keep notes: %s", e)
            return []

        files: List[SourceFile] = []
        for note in self._extract_notes(result):
            if not isinstance(note, dict):
                continue  # ignore malformed entries instead of crashing

            note_id = note.get("id", note.get("noteId", ""))
            raw_title = note.get("title")
            # A missing, null or blank title falls back to the placeholder.
            title = raw_title.strip() if isinstance(raw_title, str) else ""
            title = title or "Untitled Note"
            modified = note.get("modifiedTime", note.get("updateTime"))

            # Estimate size from text content
            text = _note_to_text(note)
            size = len(text.encode("utf-8")) if text else None

            files.append(
                SourceFile(
                    name=title,
                    id=str(note_id),
                    size_bytes=size,
                    mime_type="text/plain",
                    modified_at=modified,
                )
            )

        logger.info("Found %d note(s) in Google Keep", len(files))
        return files

    @staticmethod
    def _extract_notes(result: Any) -> List[Any]:
        """Return the list of note objects contained in a ``gws`` list result.

        The result may be a list directly, a dict with the notes nested under
        ``notes`` or ``items``, or a single note dict (recognised by an ``id``
        key and the absence of the ``raw`` fallback key).
        """
        if isinstance(result, list):
            return result
        if not isinstance(result, dict):
            return []
        notes = result.get("notes", result.get("items", []))
        # If we got a single note back (not a list), wrap it
        if not notes and "id" in result and "raw" not in result:
            return [result]
        return notes if isinstance(notes, list) else []

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a Keep note's content as a text file.

        Args:
            file: The note to fetch; ``file.id`` is sent as ``noteId``.
            destination: Path of the text file to write. Missing parent
                directories are created.

        Returns:
            ``destination`` as a ``Path``.

        Raises:
            RuntimeError: If the ``gws`` command fails.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        params = json.dumps({"noteId": file.id})
        try:
            result = _run_gws(["keep", "notes", "get", "--params", params], timeout=30)
        except RuntimeError as e:
            raise RuntimeError(f"Failed to fetch Keep note {file.id}: {e}") from e

        # result may be the note dict directly or wrapped
        note = result if isinstance(result, dict) else {}
        text = _note_to_text(note)

        if not text:
            # Fallback: use raw output if structured extraction yielded nothing
            text = note.get("raw", json.dumps(note, indent=2))

        destination.write_text(text, encoding="utf-8")
        logger.info("Saved note '%s' to %s", file.name, destination)
        return destination