PlanOpticon

planopticon / video_processor / providers / huggingface_provider.py

Blame History Raw 188 lines

1	`"""Hugging Face Inference API provider implementation."""`
2
3	`import base64`
4	`import logging`
5	`import os`
6	`from pathlib import Path`
7	`from typing import Optional`
8
9	`from dotenv import load_dotenv`
10
11	`from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry`
12
# Load variables from a .env file (if present) at import time, so that an
# HF_TOKEN defined there is visible to the os.getenv("HF_TOKEN") lookup below.
load_dotenv()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
15
# Curated list of popular HF Inference models.
# Each row below is (model id, display name, capabilities); every entry is
# tagged with the "huggingface" provider.
_HF_MODELS = [
    ModelInfo(
        id=model_id,
        provider="huggingface",
        display_name=label,
        capabilities=list(caps),
    )
    for model_id, label, caps in (
        ("meta-llama/Llama-3.1-70B-Instruct", "Llama 3.1 70B Instruct", ("chat",)),
        ("meta-llama/Llama-3.1-8B-Instruct", "Llama 3.1 8B Instruct", ("chat",)),
        ("mistralai/Mixtral-8x7B-Instruct-v0.1", "Mixtral 8x7B Instruct", ("chat",)),
        ("microsoft/Phi-3-mini-4k-instruct", "Phi-3 Mini 4K Instruct", ("chat",)),
        ("llava-hf/llava-v1.6-mistral-7b-hf", "LLaVA v1.6 Mistral 7B", ("chat", "vision")),
        ("openai/whisper-large-v3", "Whisper Large v3", ("audio",)),
    )
]
55
56
class HuggingFaceProvider(BaseProvider):
    """Hugging Face Inference API provider using huggingface_hub.

    Wraps ``huggingface_hub.InferenceClient`` for chat, image analysis and
    speech-to-text. After every call ``self._last_usage`` holds the
    ``input_tokens`` / ``output_tokens`` counts recorded for that call.
    """

    provider_name = "huggingface"

    def __init__(self, token: Optional[str] = None):
        """Create the underlying inference client.

        Args:
            token: Hugging Face API token. When omitted or empty, the
                ``HF_TOKEN`` environment variable is used instead.

        Raises:
            ImportError: If ``huggingface_hub`` is not installed.
            ValueError: If no token is passed and ``HF_TOKEN`` is not set.
        """
        # Imported lazily so the module can be imported without the package.
        try:
            from huggingface_hub import InferenceClient
        except ImportError as err:
            raise ImportError(
                "huggingface_hub package not installed. Install with: pip install huggingface_hub"
            ) from err

        self._token = token or os.getenv("HF_TOKEN")
        if not self._token:
            raise ValueError("HF_TOKEN not set")

        self._client = InferenceClient(token=self._token)
        self._last_usage = {}

    @staticmethod
    def _hf_model_id(model: Optional[str], default: str) -> str:
        """Return the model id to send: *model* or *default*, minus a leading ``hf/``."""
        return (model or default).removeprefix("hf/")

    @staticmethod
    def _sniff_image_mime(image_bytes: bytes) -> str:
        """Guess the image MIME type from its leading magic bytes.

        Recognizes PNG, GIF and WebP signatures; anything else is reported as
        ``image/jpeg``.
        """
        if image_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
            return "image/png"
        if image_bytes.startswith((b"GIF87a", b"GIF89a")):
            return "image/gif"
        # WebP is a RIFF container whose form type (bytes 8-11) is "WEBP".
        if image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
            return "image/webp"
        return "image/jpeg"

    def _store_chat_usage(self, response) -> None:
        """Record token counts from a chat completion response in ``_last_usage``.

        Missing, empty or null usage fields are stored as ``0``.
        """
        usage = getattr(response, "usage", None)
        if not usage:
            self._last_usage = {"input_tokens": 0, "output_tokens": 0}
            return
        self._last_usage = {
            "input_tokens": getattr(usage, "prompt_tokens", None) or 0,
            "output_tokens": getattr(usage, "completion_tokens", None) or 0,
        }

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run a chat completion and return the text of the first choice.

        Args:
            messages: Chat messages, passed unchanged to ``chat_completion``.
            max_tokens: Forwarded as ``max_tokens``.
            temperature: Forwarded as ``temperature``.
            model: Model id, optionally prefixed with ``hf/``. Defaults to
                ``meta-llama/Llama-3.1-70B-Instruct``.

        Returns:
            The content of the first choice, or ``""`` when it is empty.
        """
        model = self._hf_model_id(model, "meta-llama/Llama-3.1-70B-Instruct")

        response = self._client.chat_completion(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )

        self._store_chat_usage(response)
        return response.choices[0].message.content or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Ask a vision model about an image and return the text of the first choice.

        The image is sent inline as a base64 ``data:`` URL alongside *prompt*
        in a single user message.

        Args:
            image_bytes: Raw encoded image data.
            prompt: Text sent together with the image.
            max_tokens: Forwarded as ``max_tokens``.
            model: Model id, optionally prefixed with ``hf/``. Defaults to
                ``llava-hf/llava-v1.6-mistral-7b-hf``.

        Returns:
            The content of the first choice, or ``""`` when it is empty.
        """
        model = self._hf_model_id(model, "llava-hf/llava-v1.6-mistral-7b-hf")

        b64 = base64.b64encode(image_bytes).decode()
        # Label the data URL with the detected type instead of always "jpeg".
        mime = self._sniff_image_mime(image_bytes)

        response = self._client.chat_completion(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:{mime};base64,{b64}"},
                        },
                    ],
                }
            ],
            max_tokens=max_tokens,
        )

        self._store_chat_usage(response)
        return response.choices[0].message.content or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Transcribe an audio file with a speech-recognition model.

        Args:
            audio_path: Path of the audio file; it is read fully into memory.
            language: Only echoed back in the result under ``"language"``;
                it is not sent to the API by this method.
            model: Model id, optionally prefixed with ``hf/``. Defaults to
                ``openai/whisper-large-v3``.

        Returns:
            A dict with keys ``text``, ``segments`` (always ``[]``),
            ``language``, ``duration`` (always ``None``), ``provider`` and
            ``model``.
        """
        model = self._hf_model_id(model, "openai/whisper-large-v3")

        audio_bytes = Path(audio_path).read_bytes()

        result = self._client.automatic_speech_recognition(
            audio=audio_bytes,
            model=model,
        )

        # Fall back to the string form when the result has no ``text`` attribute.
        text = result.text if hasattr(result, "text") else str(result)

        # No token counts are read from the transcription result.
        self._last_usage = {
            "input_tokens": 0,
            "output_tokens": 0,
        }

        return {
            "text": text,
            "segments": [],
            "language": language,
            "duration": None,
            "provider": "huggingface",
            "model": model,
        }

    def list_models(self) -> list[ModelInfo]:
        """Return a fresh list containing the curated ``_HF_MODELS`` entries."""
        return list(_HF_MODELS)
175
176
# Runs at import time: hands the provider class, its registry name, the
# token environment variable, the "hf/" model prefix and the per-capability
# default models to the shared ProviderRegistry.
ProviderRegistry.register(
    provider_class=HuggingFaceProvider,
    name="huggingface",
    model_prefixes=["hf/"],
    env_var="HF_TOKEN",
    default_models={
        "chat": "meta-llama/Llama-3.1-70B-Instruct",
        "vision": "llava-hf/llava-v1.6-mistral-7b-hf",
        "audio": "openai/whisper-large-v3",
    },
)
188

PlanOpticon

Keyboard Shortcuts