PlanOpticon

planopticon / video_processor / sources / notion_source.py

Source Blame History 380 lines

0981a08…	noreply	1	"""Notion API source connector for fetching pages and databases."""
0981a08…	noreply	2
0981a08…	noreply	3	import logging
0981a08…	noreply	4	import os
0981a08…	noreply	5	from pathlib import Path
0981a08…	noreply	6	from typing import Dict, List, Optional
0981a08…	noreply	7
0981a08…	noreply	8	import requests
0981a08…	noreply	9
0981a08…	noreply	10	from video_processor.sources.base import BaseSource, SourceFile
0981a08…	noreply	11
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Value sent in the "Notion-Version" header of every request (see _headers).
NOTION_VERSION = "2022-06-28"
# Prefix that every endpoint path in this module is appended to.
NOTION_BASE_URL = "https://api.notion.com/v1"
0981a08…	noreply	16
0981a08…	noreply	17
class NotionSource(BaseSource):
    """
    Fetch pages and databases from Notion via the public API.

    Requires a Notion integration token (internal integration).
    Set NOTION_API_KEY env var or pass token directly.

    Requires: pip install requests
    """

    # Block types rendered as "<prefix><plain text>" with no other handling.
    _BLOCK_PREFIXES = {
        "paragraph": "",
        "heading_1": "# ",
        "heading_2": "## ",
        "heading_3": "### ",
        "bulleted_list_item": "- ",
        "quote": "> ",
    }

    def __init__(
        self,
        token: Optional[str] = None,
        database_id: Optional[str] = None,
        page_ids: Optional[List[str]] = None,
    ):
        """Store credentials and the database/pages to read.

        Args:
            token: Notion integration token. Falls back to the
                NOTION_API_KEY environment variable, then to "".
            database_id: Database whose rows are listed by list_videos().
            page_ids: Individual page IDs listed by list_videos().
        """
        self.token = token or os.environ.get("NOTION_API_KEY", "")
        self.database_id = database_id
        self.page_ids = page_ids or []

    def _headers(self) -> Dict[str, str]:
        """Return the HTTP headers sent with every Notion API request."""
        return {
            "Authorization": f"Bearer {self.token}",
            "Notion-Version": NOTION_VERSION,
            "Content-Type": "application/json",
        }

    def authenticate(self) -> bool:
        """Check token is set and make a test call to the Notion API.

        Returns:
            True when GET /users/me succeeds, False when the token is
            missing or the request fails. Failures are logged, not raised.
        """
        if not self.token:
            logger.error("Notion token not set. Provide token or set NOTION_API_KEY.")
            return False
        try:
            resp = requests.get(
                f"{NOTION_BASE_URL}/users/me",
                headers=self._headers(),
                timeout=15,
            )
            resp.raise_for_status()
            user = resp.json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            logger.error("Notion authentication failed: %s", exc)
            return False
        # "name" may be present but null; fall back to "unknown" then too.
        logger.info("Authenticated with Notion as %s", user.get("name") or "unknown")
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Notion pages as SourceFiles.

        If database_id is set, query the database for pages.
        If page_ids is set, fetch each page individually.
        Results from both are concatenated, database rows first.

        Args:
            folder_id: Not used by this method.
            folder_path: Not used by this method.
            patterns: Not used by this method.
            (Presumably kept to match the BaseSource signature -- verify.)

        Returns:
            The SourceFiles found; an empty list (with a logged warning)
            when neither source yields a page.
        """
        files: List[SourceFile] = []

        if self.database_id:
            files.extend(self._list_from_database(self.database_id))

        if self.page_ids:
            files.extend(self._list_from_pages(self.page_ids))

        if not files:
            logger.warning("No pages found. Set database_id or page_ids.")

        return files

    @staticmethod
    def _page_to_source_file(page: Dict) -> SourceFile:
        """Build the SourceFile describing one Notion page object."""
        return SourceFile(
            name=_extract_page_title(page),
            id=page["id"],
            mime_type="text/markdown",
            modified_at=page.get("last_edited_time"),
        )

    def _query_database(self, database_id: str) -> List[Dict]:
        """Return every page object of a database, following pagination.

        Raises:
            requests.RequestException: if any query request fails.
        """
        pages: List[Dict] = []
        start_cursor: Optional[str] = None

        while True:
            body: Dict = {"start_cursor": start_cursor} if start_cursor else {}
            resp = requests.post(
                f"{NOTION_BASE_URL}/databases/{database_id}/query",
                headers=self._headers(),
                json=body,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()

            pages.extend(data.get("results", []))
            start_cursor = data.get("next_cursor")
            # Stop on a missing cursor as well: without one the next request
            # would re-fetch the first page and the loop would never end.
            if not data.get("has_more", False) or not start_cursor:
                break

        return pages

    def _list_from_database(self, database_id: str) -> List[SourceFile]:
        """Query a Notion database and return SourceFiles for each row.

        Raises:
            requests.RequestException: if a query request fails.
        """
        return [
            self._page_to_source_file(page)
            for page in self._query_database(database_id)
        ]

    def _list_from_pages(self, page_ids: List[str]) -> List[SourceFile]:
        """Fetch individual pages by ID and return SourceFiles.

        A page whose request fails is logged and skipped, so the result
        may be shorter than page_ids.
        """
        files: List[SourceFile] = []
        for page_id in page_ids:
            try:
                resp = requests.get(
                    f"{NOTION_BASE_URL}/pages/{page_id}",
                    headers=self._headers(),
                    timeout=15,
                )
                resp.raise_for_status()
                files.append(self._page_to_source_file(resp.json()))
            except requests.RequestException as exc:
                logger.error("Failed to fetch page %s: %s", page_id, exc)
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download page blocks as markdown text and save to destination.

        The file content is "# <file.name>", a blank line, then the page's
        blocks rendered by _blocks_to_text. Parent directories of
        destination are created if missing.

        Returns:
            destination as a Path.

        Raises:
            requests.RequestException: if fetching the blocks fails.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        blocks = self._fetch_all_blocks(file.id)
        text = self._blocks_to_text(blocks)

        # Prepend title
        content = f"# {file.name}\n\n{text}"
        destination.write_text(content, encoding="utf-8")
        logger.info("Saved Notion page to %s", destination)
        return destination

    def _fetch_all_blocks(self, page_id: str) -> list:
        """Fetch all child blocks for a page, handling pagination.

        Only the direct children of page_id are requested; nested children
        are not followed.

        Raises:
            requests.RequestException: if any request fails.
        """
        blocks: list = []
        start_cursor: Optional[str] = None

        while True:
            # Let requests build and escape the query string.
            params: Dict = {"page_size": 100}
            if start_cursor:
                params["start_cursor"] = start_cursor

            resp = requests.get(
                f"{NOTION_BASE_URL}/blocks/{page_id}/children",
                headers=self._headers(),
                params=params,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()

            blocks.extend(data.get("results", []))
            start_cursor = data.get("next_cursor")
            # A missing cursor ends the loop even if has_more is set,
            # otherwise the first page would be requested forever.
            if not data.get("has_more", False) or not start_cursor:
                break

        return blocks

    def _blocks_to_text(self, blocks: list) -> str:
        """Convert Notion block objects to markdown text.

        Each block becomes one or more lines; lines are joined with a blank
        line between them. Consecutive numbered_list_item blocks are
        numbered 1, 2, 3...; any other block type restarts the count.
        Block types without dedicated handling contribute their rich_text
        as plain text, or nothing when they have none.
        """
        lines: List[str] = []
        numbered_index = 0

        for block in blocks:
            block_type = block.get("type", "")
            # The payload lives under a key named after the block type.
            block_data = block.get(block_type) or {}

            if block_type == "numbered_list_item":
                numbered_index += 1
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"{numbered_index}. {text}")
                continue

            # Every other block type interrupts a numbered list.
            numbered_index = 0

            if block_type == "divider":
                lines.append("---")
                continue

            text = _rich_text_to_str(block_data.get("rich_text", []))

            if block_type in self._BLOCK_PREFIXES:
                lines.append(f"{self._BLOCK_PREFIXES[block_type]}{text}")

            elif block_type == "to_do":
                marker = "[x]" if block_data.get("checked", False) else "[ ]"
                lines.append(f"- {marker} {text}")

            elif block_type == "code":
                language = block_data.get("language", "")
                lines.append(f"```{language}")
                lines.append(text)
                lines.append("```")

            elif block_type == "callout":
                # The icon may be absent/null or a non-emoji icon.
                icon = block_data.get("icon", {})
                emoji = icon.get("emoji", "") if icon else ""
                prefix = f"{emoji} " if emoji else ""
                lines.append(f"> {prefix}{text}")

            elif block_type == "toggle":
                lines.append(f"<details><summary>{text}</summary></details>")

            elif text:
                # Unsupported block type -- keep whatever rich_text it has.
                lines.append(text)

        return "\n\n".join(lines)

    def fetch_database_as_table(self, database_id: str) -> str:
        """Fetch a Notion database and return its rows as CSV text.

        Each row is a page in the database. Columns are the database's
        property names in sorted order; the first line is the header.
        Fields containing commas, quotes or newlines are quoted following
        the csv module's default dialect. Lines are separated by "\\n" and
        the result has no trailing newline.

        Raises:
            requests.RequestException: if the schema or a query request fails.
        """
        # Local imports: only this method needs them.
        import csv
        import io

        # First, get database schema for column order
        resp = requests.get(
            f"{NOTION_BASE_URL}/databases/{database_id}",
            headers=self._headers(),
            timeout=15,
        )
        resp.raise_for_status()
        columns = sorted(resp.json().get("properties", {}))

        # Query all rows
        rows = self._query_database(database_id)

        # Build the CSV through the csv writer so values are escaped.
        buffer = io.StringIO()
        writer = csv.writer(buffer, lineterminator="\n")
        writer.writerow(columns)
        for row in rows:
            row_props = row.get("properties", {})
            writer.writerow(
                [_extract_property_value(row_props.get(col, {})) for col in columns]
            )

        table = buffer.getvalue()
        # Drop only the final line terminator the writer always emits.
        return table[:-1] if table.endswith("\n") else table
0981a08…	noreply	321
0981a08…	noreply	322
0981a08…	noreply	323	def _rich_text_to_str(rich_text: list) -> str:
0981a08…	noreply	324	"""Extract plain text from a Notion rich_text array."""
0981a08…	noreply	325	return "".join(item.get("plain_text", "") for item in rich_text)
0981a08…	noreply	326
0981a08…	noreply	327
0981a08…	noreply	328	def _extract_page_title(page: dict) -> str:
0981a08…	noreply	329	"""Extract the title from a Notion page object."""
0981a08…	noreply	330	properties = page.get("properties", {})
0981a08…	noreply	331	for prop in properties.values():
0981a08…	noreply	332	if prop.get("type") == "title":
0981a08…	noreply	333	return _rich_text_to_str(prop.get("title", []))
0981a08…	noreply	334	return "Untitled"
0981a08…	noreply	335
0981a08…	noreply	336
0981a08…	noreply	337	def _extract_property_value(prop: dict) -> str:
0981a08…	noreply	338	"""Extract a display string from a Notion property value."""
0981a08…	noreply	339	prop_type = prop.get("type", "")
0981a08…	noreply	340
0981a08…	noreply	341	if prop_type == "title":
0981a08…	noreply	342	return _rich_text_to_str(prop.get("title", []))
0981a08…	noreply	343	elif prop_type == "rich_text":
0981a08…	noreply	344	return _rich_text_to_str(prop.get("rich_text", []))
0981a08…	noreply	345	elif prop_type == "number":
0981a08…	noreply	346	val = prop.get("number")
0981a08…	noreply	347	return str(val) if val is not None else ""
0981a08…	noreply	348	elif prop_type == "select":
0981a08…	noreply	349	sel = prop.get("select")
0981a08…	noreply	350	return sel.get("name", "") if sel else ""
0981a08…	noreply	351	elif prop_type == "multi_select":
0981a08…	noreply	352	return "; ".join(s.get("name", "") for s in prop.get("multi_select", []))
0981a08…	noreply	353	elif prop_type == "date":
0981a08…	noreply	354	date = prop.get("date")
0981a08…	noreply	355	if date:
0981a08…	noreply	356	start = date.get("start", "")
0981a08…	noreply	357	end = date.get("end", "")
0981a08…	noreply	358	return f"{start} - {end}" if end else start
0981a08…	noreply	359	return ""
0981a08…	noreply	360	elif prop_type == "checkbox":
0981a08…	noreply	361	return str(prop.get("checkbox", False))
0981a08…	noreply	362	elif prop_type == "url":
0981a08…	noreply	363	return prop.get("url", "") or ""
0981a08…	noreply	364	elif prop_type == "email":
0981a08…	noreply	365	return prop.get("email", "") or ""
0981a08…	noreply	366	elif prop_type == "phone_number":
0981a08…	noreply	367	return prop.get("phone_number", "") or ""
0981a08…	noreply	368	elif prop_type == "status":
0981a08…	noreply	369	status = prop.get("status")
0981a08…	noreply	370	return status.get("name", "") if status else ""
0981a08…	noreply	371	elif prop_type == "people":
0981a08…	noreply	372	return "; ".join(p.get("name", "") for p in prop.get("people", []))
0981a08…	noreply	373	elif prop_type == "relation":
0981a08…	noreply	374	return "; ".join(r.get("id", "") for r in prop.get("relation", []))
0981a08…	noreply	375	elif prop_type == "formula":
0981a08…	noreply	376	formula = prop.get("formula", {})
0981a08…	noreply	377	f_type = formula.get("type", "")
0981a08…	noreply	378	return str(formula.get(f_type, ""))
0981a08…	noreply	379	else:
0981a08…	noreply	380	return ""

PlanOpticon

Keyboard Shortcuts