Hugoifier

feat: direct HTML extraction for raw HTML path (no AI)

Apply the same philosophy as the Next.js rendered capture: use the actual HTML directly instead of asking AI to reinterpret it.

- hugoify_html() now extracts head/body from real HTML, no AI call
- Preserves all content, classes, SVGs, styles exactly as-is
- Copies ALL static assets from HTML theme dir (not just known names)
- Zero latency, zero token cost, zero content loss
- AI multi-pass fallback still available for Next.js without dev server

lmata 2026-03-17 14:23 trunk
Commit 1efd3dd395b9c46745d63a7c06b620776031023296314ae24cbcfa17b964ef08
--- hugoifier/utils/complete.py
+++ hugoifier/utils/complete.py
@@ -208,13 +208,13 @@
208208
if output_dir is None:
209209
output_dir = str(Path(__file__).parents[2] / 'output' / theme_name)
210210
211211
logging.info(f"Converting raw HTML theme: {theme_name}")
212212
213
- # Use AI to convert the main HTML file to Hugo layouts
213
+ # Direct HTML extraction — use the actual HTML as-is, no AI reinterpretation
214214
main_html = _pick_main_html(html_files)
215
- logging.info(f"Converting {main_html} ...")
215
+ logging.info(f"Extracting {main_html} ...")
216216
hugo_layouts = hugoify_html(main_html)
217217
218218
os.makedirs(output_dir, exist_ok=True)
219219
220220
# Write converted layouts
@@ -226,15 +226,20 @@
226226
dest = os.path.join(theme_layouts_dir, filename)
227227
os.makedirs(os.path.dirname(dest), exist_ok=True)
228228
with open(dest, 'w') as f:
229229
f.write(content)
230230
231
- # Copy CSS/JS/images
232
- for ext_dir in ('css', 'js', 'images', 'img', 'assets', 'fonts'):
233
- src = os.path.join(input_path, ext_dir)
234
- if os.path.isdir(src):
235
- _copy_dir(src, os.path.join(output_dir, 'themes', theme_name, 'static', ext_dir))
231
+ # Copy ALL static assets from the HTML theme directory
232
+ theme_static = os.path.join(output_dir, 'themes', theme_name, 'static')
233
+ for item in os.listdir(input_path):
234
+ src = os.path.join(input_path, item)
235
+ if os.path.isdir(src) and item not in ('__MACOSX', '.git', 'node_modules'):
236
+ _copy_dir(src, os.path.join(theme_static, item))
237
+ elif os.path.isfile(src) and not item.endswith('.html'):
238
+ # Copy non-HTML files (images, fonts, etc.) to static root
239
+ os.makedirs(theme_static, exist_ok=True)
240
+ shutil.copy2(src, os.path.join(theme_static, item))
236241
237242
_write_minimal_hugo_toml(output_dir, theme_name)
238243
239244
# Create minimal content
240245
content_dir = os.path.join(output_dir, 'content')
241246
--- hugoifier/utils/complete.py
+++ hugoifier/utils/complete.py
@@ -208,13 +208,13 @@
208 if output_dir is None:
209 output_dir = str(Path(__file__).parents[2] / 'output' / theme_name)
210
211 logging.info(f"Converting raw HTML theme: {theme_name}")
212
213 # Use AI to convert the main HTML file to Hugo layouts
214 main_html = _pick_main_html(html_files)
215 logging.info(f"Converting {main_html} ...")
216 hugo_layouts = hugoify_html(main_html)
217
218 os.makedirs(output_dir, exist_ok=True)
219
220 # Write converted layouts
@@ -226,15 +226,20 @@
226 dest = os.path.join(theme_layouts_dir, filename)
227 os.makedirs(os.path.dirname(dest), exist_ok=True)
228 with open(dest, 'w') as f:
229 f.write(content)
230
231 # Copy CSS/JS/images
232 for ext_dir in ('css', 'js', 'images', 'img', 'assets', 'fonts'):
233 src = os.path.join(input_path, ext_dir)
234 if os.path.isdir(src):
235 _copy_dir(src, os.path.join(output_dir, 'themes', theme_name, 'static', ext_dir))
 
 
 
 
 
236
237 _write_minimal_hugo_toml(output_dir, theme_name)
238
239 # Create minimal content
240 content_dir = os.path.join(output_dir, 'content')
241
--- hugoifier/utils/complete.py
+++ hugoifier/utils/complete.py
@@ -208,13 +208,13 @@
208 if output_dir is None:
209 output_dir = str(Path(__file__).parents[2] / 'output' / theme_name)
210
211 logging.info(f"Converting raw HTML theme: {theme_name}")
212
213 # Direct HTML extraction — use the actual HTML as-is, no AI reinterpretation
214 main_html = _pick_main_html(html_files)
215 logging.info(f"Extracting {main_html} ...")
216 hugo_layouts = hugoify_html(main_html)
217
218 os.makedirs(output_dir, exist_ok=True)
219
220 # Write converted layouts
@@ -226,15 +226,20 @@
226 dest = os.path.join(theme_layouts_dir, filename)
227 os.makedirs(os.path.dirname(dest), exist_ok=True)
228 with open(dest, 'w') as f:
229 f.write(content)
230
231 # Copy ALL static assets from the HTML theme directory
232 theme_static = os.path.join(output_dir, 'themes', theme_name, 'static')
233 for item in os.listdir(input_path):
234 src = os.path.join(input_path, item)
235 if os.path.isdir(src) and item not in ('__MACOSX', '.git', 'node_modules'):
236 _copy_dir(src, os.path.join(theme_static, item))
237 elif os.path.isfile(src) and not item.endswith('.html'):
238 # Copy non-HTML files (images, fonts, etc.) to static root
239 os.makedirs(theme_static, exist_ok=True)
240 shutil.copy2(src, os.path.join(theme_static, item))
241
242 _write_minimal_hugo_toml(output_dir, theme_name)
243
244 # Create minimal content
245 content_dir = os.path.join(output_dir, 'content')
246
--- hugoifier/utils/hugoify.py
+++ hugoifier/utils/hugoify.py
@@ -28,55 +28,70 @@
2828
2929
def hugoify_html(html_path: str) -> dict:
3030
"""
3131
Convert a raw HTML file to a set of Hugo layout files.
3232
33
- Returns dict mapping relative layout paths to their content, e.g.:
34
- {
35
- "_default/baseof.html": "<!DOCTYPE html>...",
36
- "partials/header.html": "<header>...",
37
- "partials/footer.html": "<footer>...",
38
- "index.html": "{{ define \"main\" }}...",
39
- }
33
+ Uses direct HTML extraction (no AI) to preserve content exactly as-is.
34
+ Splits the HTML into Hugo's baseof.html (head/shell) and index.html (body content).
35
+
36
+ Returns dict mapping relative layout paths to their content.
4037
"""
4138
logging.info(f"Hugoifying {html_path} ...")
4239
4340
with open(html_path, 'r', errors='replace') as f:
4441
html = f.read()
4542
46
- # Truncate very large files to avoid token limits
47
- if len(html) > 30000:
48
- logging.warning(f"HTML is large ({len(html)} chars), truncating to 30000 for AI analysis")
49
- html = html[:30000]
50
-
51
- prompt = f"""Convert the following HTML file into Hugo layout files.
52
-
53
-Return a JSON object where keys are relative file paths under layouts/ and values are the Hugo template content.
54
-
55
-Required keys to produce:
56
-- "_default/baseof.html" — base template with blocks for head, header, main, footer
57
-- "partials/header.html" — site header/nav extracted as partial
58
-- "partials/footer.html" — footer extracted as partial
59
-- "index.html" — homepage using {{ define "main" }} ... {{ end }}
60
-
61
-Rules:
62
-- Replace hardcoded page titles with {{ .Title }}
63
-- Replace hardcoded site name with {{ .Site.Title }}
64
-- Replace hardcoded URLs with {{ .Site.BaseURL }} or {{ .Permalink }}
65
-- Replace nav links with {{ range .Site.Menus.main }}<a href="{{ .URL }}">{{ .Name }}</a>{{ end }}
66
-- Replace blog post lists with {{ range .Pages }} ... {{ end }}
67
-- Replace copyright year with {{ now.Year }}
68
-- Keep all CSS classes and HTML structure intact
69
-- Use {{ partial "header.html" . }} and {{ partial "footer.html" . }} in baseof.html
70
-
71
-HTML to convert:
72
-{html}
73
-
74
-Return ONLY a valid JSON object, no explanation."""
75
-
76
- response = call_ai(prompt, SYSTEM)
77
- return _parse_layout_json(response)
43
+ logging.info(f"Read {len(html)} chars from {html_path}")
44
+
45
+ # Extract <head> content (CSS links, meta, fonts, etc.)
46
+ head_extras = _extract_head_content(html)
47
+
48
+ # Extract and rewrite CSS/JS paths to be relative to Hugo static/
49
+ css_links = re.findall(r'<link[^>]+rel=["\']stylesheet["\'][^>]*/?>',
50
+ html, re.DOTALL | re.IGNORECASE)
51
+ js_links = re.findall(r'<script[^>]+src=["\'][^"\']+["\'][^>]*>.*?</script>',
52
+ html, re.DOTALL)
53
+
54
+ # Extract <body> content
55
+ body_match = re.search(r'<body[^>]*>(.*?)</body>', html, re.DOTALL)
56
+ body_content = body_match.group(1).strip() if body_match else html
57
+
58
+ # Extract body attributes (class, style, etc.)
59
+ body_attrs_match = re.search(r'<body([^>]*)>', html)
60
+ body_attrs = body_attrs_match.group(1).strip() if body_attrs_match else ''
61
+
62
+ # Build baseof.html preserving the original <head> structure
63
+ head_match = re.search(r'<head[^>]*>(.*?)</head>', html, re.DOTALL)
64
+ if head_match:
65
+ head_content = head_match.group(1).strip()
66
+ # Replace hardcoded <title> with Hugo template
67
+ head_content = re.sub(
68
+ r'<title>[^<]*</title>',
69
+ '<title>{{ if .IsHome }}{{ .Site.Title }}{{ else }}{{ .Title }} | {{ .Site.Title }}{{ end }}</title>',
70
+ head_content
71
+ )
72
+ baseof = f'''<!DOCTYPE html>
73
+<html lang="{{{{ with .Site.LanguageCode }}}}{{{{ . }}}}{{{{ else }}}}en{{{{ end }}}}">
74
+<head>
75
+{head_content}
76
+</head>
77
+<body{" " + body_attrs if body_attrs else ""}>
78
+ {{{{- block "main" . }}}}{{{{- end }}}}
79
+</body>
80
+</html>'''
81
+ else:
82
+ baseof = _fallback_baseof()
83
+
84
+ index_html = f'{{{{ define "main" }}}}\n{body_content}\n{{{{ end }}}}'
85
+
86
+ layouts = {
87
+ "_default/baseof.html": baseof,
88
+ "index.html": index_html,
89
+ }
90
+
91
+ logging.info(f"Extracted {len(layouts)} layout files directly from HTML (no AI)")
92
+ return layouts
7893
7994
8095
def hugoify_nextjs(info: dict, dev_url: str = None) -> dict:
8196
"""
8297
Convert a Next.js app to a set of Hugo layout files.
8398
--- hugoifier/utils/hugoify.py
+++ hugoifier/utils/hugoify.py
@@ -28,55 +28,70 @@
28
29 def hugoify_html(html_path: str) -> dict:
30 """
31 Convert a raw HTML file to a set of Hugo layout files.
32
33 Returns dict mapping relative layout paths to their content, e.g.:
34 {
35 "_default/baseof.html": "<!DOCTYPE html>...",
36 "partials/header.html": "<header>...",
37 "partials/footer.html": "<footer>...",
38 "index.html": "{{ define \"main\" }}...",
39 }
40 """
41 logging.info(f"Hugoifying {html_path} ...")
42
43 with open(html_path, 'r', errors='replace') as f:
44 html = f.read()
45
46 # Truncate very large files to avoid token limits
47 if len(html) > 30000:
48 logging.warning(f"HTML is large ({len(html)} chars), truncating to 30000 for AI analysis")
49 html = html[:30000]
50
51 prompt = f"""Convert the following HTML file into Hugo layout files.
52
53 Return a JSON object where keys are relative file paths under layouts/ and values are the Hugo template content.
54
55 Required keys to produce:
56 - "_default/baseof.html" — base template with blocks for head, header, main, footer
57 - "partials/header.html" — site header/nav extracted as partial
58 - "partials/footer.html" — footer extracted as partial
59 - "index.html" — homepage using {{ define "main" }} ... {{ end }}
60
61 Rules:
62 - Replace hardcoded page titles with {{ .Title }}
63 - Replace hardcoded site name with {{ .Site.Title }}
64 - Replace hardcoded URLs with {{ .Site.BaseURL }} or {{ .Permalink }}
65 - Replace nav links with {{ range .Site.Menus.main }}<a href="{{ .URL }}">{{ .Name }}</a>{{ end }}
66 - Replace blog post lists with {{ range .Pages }} ... {{ end }}
67 - Replace copyright year with {{ now.Year }}
68 - Keep all CSS classes and HTML structure intact
69 - Use {{ partial "header.html" . }} and {{ partial "footer.html" . }} in baseof.html
70
71 HTML to convert:
72 {html}
73
74 Return ONLY a valid JSON object, no explanation."""
75
76 response = call_ai(prompt, SYSTEM)
77 return _parse_layout_json(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
79
80 def hugoify_nextjs(info: dict, dev_url: str = None) -> dict:
81 """
82 Convert a Next.js app to a set of Hugo layout files.
83
--- hugoifier/utils/hugoify.py
+++ hugoifier/utils/hugoify.py
@@ -28,55 +28,70 @@
28
29 def hugoify_html(html_path: str) -> dict:
30 """
31 Convert a raw HTML file to a set of Hugo layout files.
32
33 Uses direct HTML extraction (no AI) to preserve content exactly as-is.
34 Splits the HTML into Hugo's baseof.html (head/shell) and index.html (body content).
35
36 Returns dict mapping relative layout paths to their content.
 
 
 
37 """
38 logging.info(f"Hugoifying {html_path} ...")
39
40 with open(html_path, 'r', errors='replace') as f:
41 html = f.read()
42
43 logging.info(f"Read {len(html)} chars from {html_path}")
44
45 # Extract <head> content (CSS links, meta, fonts, etc.)
46 head_extras = _extract_head_content(html)
47
48 # Extract and rewrite CSS/JS paths to be relative to Hugo static/
49 css_links = re.findall(r'<link[^>]+rel=["\']stylesheet["\'][^>]*/?>',
50 html, re.DOTALL | re.IGNORECASE)
51 js_links = re.findall(r'<script[^>]+src=["\'][^"\']+["\'][^>]*>.*?</script>',
52 html, re.DOTALL)
53
54 # Extract <body> content
55 body_match = re.search(r'<body[^>]*>(.*?)</body>', html, re.DOTALL)
56 body_content = body_match.group(1).strip() if body_match else html
57
58 # Extract body attributes (class, style, etc.)
59 body_attrs_match = re.search(r'<body([^>]*)>', html)
60 body_attrs = body_attrs_match.group(1).strip() if body_attrs_match else ''
61
62 # Build baseof.html preserving the original <head> structure
63 head_match = re.search(r'<head[^>]*>(.*?)</head>', html, re.DOTALL)
64 if head_match:
65 head_content = head_match.group(1).strip()
66 # Replace hardcoded <title> with Hugo template
67 head_content = re.sub(
68 r'<title>[^<]*</title>',
69 '<title>{{ if .IsHome }}{{ .Site.Title }}{{ else }}{{ .Title }} | {{ .Site.Title }}{{ end }}</title>',
70 head_content
71 )
72 baseof = f'''<!DOCTYPE html>
73 <html lang="{{{{ with .Site.LanguageCode }}}}{{{{ . }}}}{{{{ else }}}}en{{{{ end }}}}">
74 <head>
75 {head_content}
76 </head>
77 <body{" " + body_attrs if body_attrs else ""}>
78 {{{{- block "main" . }}}}{{{{- end }}}}
79 </body>
80 </html>'''
81 else:
82 baseof = _fallback_baseof()
83
84 index_html = f'{{{{ define "main" }}}}\n{body_content}\n{{{{ end }}}}'
85
86 layouts = {
87 "_default/baseof.html": baseof,
88 "index.html": index_html,
89 }
90
91 logging.info(f"Extracted {len(layouts)} layout files directly from HTML (no AI)")
92 return layouts
93
94
95 def hugoify_nextjs(info: dict, dev_url: str = None) -> dict:
96 """
97 Convert a Next.js app to a set of Hugo layout files.
98

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button