Hugoifier

feat: direct HTML extraction for raw HTML path (no AI)

Apply the same philosophy as the Next.js rendered capture: use the actual HTML directly instead of asking AI to reinterpret it.

- hugoify_html() now extracts head/body from real HTML, no AI call
- Preserves all content, classes, SVGs, styles exactly as-is
- Copies ALL static assets from HTML theme dir (not just known names)
- Zero latency, zero token cost, zero content loss
- AI multi-pass fallback still available for Next.js without dev server

lmata 2026-03-17 14:23 trunk

Commit 1efd3dd395b9c46745d63a7c06b620776031023296314ae24cbcfa17b964ef08

Parent 6b32ba2bba58cf0…

2 files changed +12 -7 +54 -39

~ hugoifier/utils/complete.py ~ hugoifier/utils/hugoify.py

M hugoifier/utils/complete.py

+12 -7

		--- hugoifier/utils/complete.py
		+++ hugoifier/utils/complete.py
		@@ -208,13 +208,13 @@
208	208	if output_dir is None:
209	209	output_dir = str(Path(__file__).parents[2] / 'output' / theme_name)
210	210
211	211	logging.info(f"Converting raw HTML theme: {theme_name}")
212	212
213		- # Use AI to convert the main HTML file to Hugo layouts
	213	+ # Direct HTML extraction — use the actual HTML as-is, no AI reinterpretation
214	214	main_html = _pick_main_html(html_files)
215		- logging.info(f"Converting {main_html} ...")
	215	+ logging.info(f"Extracting {main_html} ...")
216	216	hugo_layouts = hugoify_html(main_html)
217	217
218	218	os.makedirs(output_dir, exist_ok=True)
219	219
220	220	# Write converted layouts
		@@ -226,15 +226,20 @@
226	226	dest = os.path.join(theme_layouts_dir, filename)
227	227	os.makedirs(os.path.dirname(dest), exist_ok=True)
228	228	with open(dest, 'w') as f:
229	229	f.write(content)
230	230
231		- # Copy CSS/JS/images
232		- for ext_dir in ('css', 'js', 'images', 'img', 'assets', 'fonts'):
233		- src = os.path.join(input_path, ext_dir)
234		- if os.path.isdir(src):
235		- _copy_dir(src, os.path.join(output_dir, 'themes', theme_name, 'static', ext_dir))
	231	+ # Copy ALL static assets from the HTML theme directory
	232	+ theme_static = os.path.join(output_dir, 'themes', theme_name, 'static')
	233	+ for item in os.listdir(input_path):
	234	+ src = os.path.join(input_path, item)
	235	+ if os.path.isdir(src) and item not in ('__MACOSX', '.git', 'node_modules'):
	236	+ _copy_dir(src, os.path.join(theme_static, item))
	237	+ elif os.path.isfile(src) and not item.endswith('.html'):
	238	+ # Copy non-HTML files (images, fonts, etc.) to static root
	239	+ os.makedirs(theme_static, exist_ok=True)
	240	+ shutil.copy2(src, os.path.join(theme_static, item))
236	241
237	242	_write_minimal_hugo_toml(output_dir, theme_name)
238	243
239	244	# Create minimal content
240	245	content_dir = os.path.join(output_dir, 'content')
241	246

	--- hugoifier/utils/complete.py
	+++ hugoifier/utils/complete.py
	@@ -208,13 +208,13 @@
208	if output_dir is None:
209	output_dir = str(Path(__file__).parents[2] / 'output' / theme_name)
210
211	logging.info(f"Converting raw HTML theme: {theme_name}")
212
213	# Use AI to convert the main HTML file to Hugo layouts
214	main_html = _pick_main_html(html_files)
215	logging.info(f"Converting {main_html} ...")
216	hugo_layouts = hugoify_html(main_html)
217
218	os.makedirs(output_dir, exist_ok=True)
219
220	# Write converted layouts
	@@ -226,15 +226,20 @@
226	dest = os.path.join(theme_layouts_dir, filename)
227	os.makedirs(os.path.dirname(dest), exist_ok=True)
228	with open(dest, 'w') as f:
229	f.write(content)
230
231	# Copy CSS/JS/images
232	for ext_dir in ('css', 'js', 'images', 'img', 'assets', 'fonts'):
233	src = os.path.join(input_path, ext_dir)
234	if os.path.isdir(src):
235	_copy_dir(src, os.path.join(output_dir, 'themes', theme_name, 'static', ext_dir))





236
237	_write_minimal_hugo_toml(output_dir, theme_name)
238
239	# Create minimal content
240	content_dir = os.path.join(output_dir, 'content')
241

	--- hugoifier/utils/complete.py
	+++ hugoifier/utils/complete.py
	@@ -208,13 +208,13 @@
208	if output_dir is None:
209	output_dir = str(Path(__file__).parents[2] / 'output' / theme_name)
210
211	logging.info(f"Converting raw HTML theme: {theme_name}")
212
213	# Direct HTML extraction — use the actual HTML as-is, no AI reinterpretation
214	main_html = _pick_main_html(html_files)
215	logging.info(f"Extracting {main_html} ...")
216	hugo_layouts = hugoify_html(main_html)
217
218	os.makedirs(output_dir, exist_ok=True)
219
220	# Write converted layouts
	@@ -226,15 +226,20 @@
226	dest = os.path.join(theme_layouts_dir, filename)
227	os.makedirs(os.path.dirname(dest), exist_ok=True)
228	with open(dest, 'w') as f:
229	f.write(content)
230
231	# Copy ALL static assets from the HTML theme directory
232	theme_static = os.path.join(output_dir, 'themes', theme_name, 'static')
233	for item in os.listdir(input_path):
234	src = os.path.join(input_path, item)
235	if os.path.isdir(src) and item not in ('__MACOSX', '.git', 'node_modules'):
236	_copy_dir(src, os.path.join(theme_static, item))
237	elif os.path.isfile(src) and not item.endswith('.html'):
238	# Copy non-HTML files (images, fonts, etc.) to static root
239	os.makedirs(theme_static, exist_ok=True)
240	shutil.copy2(src, os.path.join(theme_static, item))
241
242	_write_minimal_hugo_toml(output_dir, theme_name)
243
244	# Create minimal content
245	content_dir = os.path.join(output_dir, 'content')
246

M hugoifier/utils/hugoify.py

+54 -39

		--- hugoifier/utils/hugoify.py
		+++ hugoifier/utils/hugoify.py
		@@ -28,55 +28,70 @@
28	28
29	29	def hugoify_html(html_path: str) -> dict:
30	30	"""
31	31	Convert a raw HTML file to a set of Hugo layout files.
32	32
33		- Returns dict mapping relative layout paths to their content, e.g.:
34		- {
35		- "_default/baseof.html": "<!DOCTYPE html>...",
36		- "partials/header.html": "<header>...",
37		- "partials/footer.html": "<footer>...",
38		- "index.html": "{{ define \"main\" }}...",
39		- }
	33	+ Uses direct HTML extraction (no AI) to preserve content exactly as-is.
	34	+ Splits the HTML into Hugo's baseof.html (head/shell) and index.html (body content).
	35	+
	36	+ Returns dict mapping relative layout paths to their content.
40	37	"""
41	38	logging.info(f"Hugoifying {html_path} ...")
42	39
43	40	with open(html_path, 'r', errors='replace') as f:
44	41	html = f.read()
45	42
46		- # Truncate very large files to avoid token limits
47		- if len(html) > 30000:
48		- logging.warning(f"HTML is large ({len(html)} chars), truncating to 30000 for AI analysis")
49		- html = html[:30000]
50		-
51		- prompt = f"""Convert the following HTML file into Hugo layout files.
52		-
53		-Return a JSON object where keys are relative file paths under layouts/ and values are the Hugo template content.
54		-
55		-Required keys to produce:
56		-- "_default/baseof.html" — base template with blocks for head, header, main, footer
57		-- "partials/header.html" — site header/nav extracted as partial
58		-- "partials/footer.html" — footer extracted as partial
59		-- "index.html" — homepage using {{ define "main" }} ... {{ end }}
60		-
61		-Rules:
62		-- Replace hardcoded page titles with {{ .Title }}
63		-- Replace hardcoded site name with {{ .Site.Title }}
64		-- Replace hardcoded URLs with {{ .Site.BaseURL }} or {{ .Permalink }}
65		-- Replace nav links with {{ range .Site.Menus.main }}<a href="{{ .URL }}">{{ .Name }}</a>{{ end }}
66		-- Replace blog post lists with {{ range .Pages }} ... {{ end }}
67		-- Replace copyright year with {{ now.Year }}
68		-- Keep all CSS classes and HTML structure intact
69		-- Use {{ partial "header.html" . }} and {{ partial "footer.html" . }} in baseof.html
70		-
71		-HTML to convert:
72		-{html}
73		-
74		-Return ONLY a valid JSON object, no explanation."""
75		-
76		- response = call_ai(prompt, SYSTEM)
77		- return _parse_layout_json(response)
	43	+ logging.info(f"Read {len(html)} chars from {html_path}")
	44	+
	45	+ # Extract <head> content (CSS links, meta, fonts, etc.)
	46	+ head_extras = _extract_head_content(html)
	47	+
	48	+ # Extract and rewrite CSS/JS paths to be relative to Hugo static/
	49	+ css_links = re.findall(r'<link[^>]+rel=["\']stylesheet["\'][^>]*/?>',
	50	+ html, re.DOTALL | re.IGNORECASE)
	51	+ js_links = re.findall(r'<script[^>]+src=["\'][^"\']+["\'][^>]*>.*?</script>',
	52	+ html, re.DOTALL)
	53	+
	54	+ # Extract <body> content
	55	+ body_match = re.search(r'<body[^>]*>(.*?)</body>', html, re.DOTALL)
	56	+ body_content = body_match.group(1).strip() if body_match else html
	57	+
	58	+ # Extract body attributes (class, style, etc.)
	59	+ body_attrs_match = re.search(r'<body([^>]*)>', html)
	60	+ body_attrs = body_attrs_match.group(1).strip() if body_attrs_match else ''
	61	+
	62	+ # Build baseof.html preserving the original <head> structure
	63	+ head_match = re.search(r'<head[^>]*>(.*?)</head>', html, re.DOTALL)
	64	+ if head_match:
	65	+ head_content = head_match.group(1).strip()
	66	+ # Replace hardcoded <title> with Hugo template
	67	+ head_content = re.sub(
	68	+ r'<title>[^<]*</title>',
	69	+ '<title>{{ if .IsHome }}{{ .Site.Title }}{{ else }}{{ .Title }} | {{ .Site.Title }}{{ end }}</title>',
	70	+ head_content
	71	+ )
	72	+ baseof = f'''<!DOCTYPE html>
	73	+<html lang="{{{{ with .Site.LanguageCode }}}}{{{{ . }}}}{{{{ else }}}}en{{{{ end }}}}">
	74	+<head>
	75	+{head_content}
	76	+</head>
	77	+<body{" " + body_attrs if body_attrs else ""}>
	78	+ {{{{- block "main" . }}}}{{{{- end }}}}
	79	+</body>
	80	+</html>'''
	81	+ else:
	82	+ baseof = _fallback_baseof()
	83	+
	84	+ index_html = f'{{{{ define "main" }}}}\n{body_content}\n{{{{ end }}}}'
	85	+
	86	+ layouts = {
	87	+ "_default/baseof.html": baseof,
	88	+ "index.html": index_html,
	89	+ }
	90	+
	91	+ logging.info(f"Extracted {len(layouts)} layout files directly from HTML (no AI)")
	92	+ return layouts
78	93
79	94
80	95	def hugoify_nextjs(info: dict, dev_url: str = None) -> dict:
81	96	"""
82	97	Convert a Next.js app to a set of Hugo layout files.
83	98

	--- hugoifier/utils/hugoify.py
	+++ hugoifier/utils/hugoify.py
	@@ -28,55 +28,70 @@
28
29	def hugoify_html(html_path: str) -> dict:
30	"""
31	Convert a raw HTML file to a set of Hugo layout files.
32
33	Returns dict mapping relative layout paths to their content, e.g.:
34	{
35	"_default/baseof.html": "<!DOCTYPE html>...",
36	"partials/header.html": "<header>...",
37	"partials/footer.html": "<footer>...",
38	"index.html": "{{ define \"main\" }}...",
39	}
40	"""
41	logging.info(f"Hugoifying {html_path} ...")
42
43	with open(html_path, 'r', errors='replace') as f:
44	html = f.read()
45
46	# Truncate very large files to avoid token limits
47	if len(html) > 30000:
48	logging.warning(f"HTML is large ({len(html)} chars), truncating to 30000 for AI analysis")
49	html = html[:30000]
50
51	prompt = f"""Convert the following HTML file into Hugo layout files.
52
53	Return a JSON object where keys are relative file paths under layouts/ and values are the Hugo template content.
54
55	Required keys to produce:
56	- "_default/baseof.html" — base template with blocks for head, header, main, footer
57	- "partials/header.html" — site header/nav extracted as partial
58	- "partials/footer.html" — footer extracted as partial
59	- "index.html" — homepage using {{ define "main" }} ... {{ end }}
60
61	Rules:
62	- Replace hardcoded page titles with {{ .Title }}
63	- Replace hardcoded site name with {{ .Site.Title }}
64	- Replace hardcoded URLs with {{ .Site.BaseURL }} or {{ .Permalink }}
65	- Replace nav links with {{ range .Site.Menus.main }}<a href="{{ .URL }}">{{ .Name }}</a>{{ end }}
66	- Replace blog post lists with {{ range .Pages }} ... {{ end }}
67	- Replace copyright year with {{ now.Year }}
68	- Keep all CSS classes and HTML structure intact
69	- Use {{ partial "header.html" . }} and {{ partial "footer.html" . }} in baseof.html
70
71	HTML to convert:
72	{html}
73
74	Return ONLY a valid JSON object, no explanation."""
75
76	response = call_ai(prompt, SYSTEM)
77	return _parse_layout_json(response)


















78
79
80	def hugoify_nextjs(info: dict, dev_url: str = None) -> dict:
81	"""
82	Convert a Next.js app to a set of Hugo layout files.
83

	--- hugoifier/utils/hugoify.py
	+++ hugoifier/utils/hugoify.py
	@@ -28,55 +28,70 @@
28
29	def hugoify_html(html_path: str) -> dict:
30	"""
31	Convert a raw HTML file to a set of Hugo layout files.
32
33	Uses direct HTML extraction (no AI) to preserve content exactly as-is.
34	Splits the HTML into Hugo's baseof.html (head/shell) and index.html (body content).
35
36	Returns dict mapping relative layout paths to their content.



37	"""
38	logging.info(f"Hugoifying {html_path} ...")
39
40	with open(html_path, 'r', errors='replace') as f:
41	html = f.read()
42
43	logging.info(f"Read {len(html)} chars from {html_path}")
44
45	# Extract <head> content (CSS links, meta, fonts, etc.)
46	head_extras = _extract_head_content(html)
47
48	# Extract and rewrite CSS/JS paths to be relative to Hugo static/
49	css_links = re.findall(r'<link[^>]+rel=["\']stylesheet["\'][^>]*/?>',
50	html, re.DOTALL | re.IGNORECASE)
51	js_links = re.findall(r'<script[^>]+src=["\'][^"\']+["\'][^>]*>.*?</script>',
52	html, re.DOTALL)
53
54	# Extract <body> content
55	body_match = re.search(r'<body[^>]*>(.*?)</body>', html, re.DOTALL)
56	body_content = body_match.group(1).strip() if body_match else html
57
58	# Extract body attributes (class, style, etc.)
59	body_attrs_match = re.search(r'<body([^>]*)>', html)
60	body_attrs = body_attrs_match.group(1).strip() if body_attrs_match else ''
61
62	# Build baseof.html preserving the original <head> structure
63	head_match = re.search(r'<head[^>]*>(.*?)</head>', html, re.DOTALL)
64	if head_match:
65	head_content = head_match.group(1).strip()
66	# Replace hardcoded <title> with Hugo template
67	head_content = re.sub(
68	r'<title>[^<]*</title>',
69	'<title>{{ if .IsHome }}{{ .Site.Title }}{{ else }}{{ .Title }} | {{ .Site.Title }}{{ end }}</title>',
70	head_content
71	)
72	baseof = f'''<!DOCTYPE html>
73	<html lang="{{{{ with .Site.LanguageCode }}}}{{{{ . }}}}{{{{ else }}}}en{{{{ end }}}}">
74	<head>
75	{head_content}
76	</head>
77	<body{" " + body_attrs if body_attrs else ""}>
78	{{{{- block "main" . }}}}{{{{- end }}}}
79	</body>
80	</html>'''
81	else:
82	baseof = _fallback_baseof()
83
84	index_html = f'{{{{ define "main" }}}}\n{body_content}\n{{{{ end }}}}'
85
86	layouts = {
87	"_default/baseof.html": baseof,
88	"index.html": index_html,
89	}
90
91	logging.info(f"Extracted {len(layouts)} layout files directly from HTML (no AI)")
92	return layouts
93
94
95	def hugoify_nextjs(info: dict, dev_url: str = None) -> dict:
96	"""
97	Convert a Next.js app to a set of Hugo layout files.
98

Hugoifier

Keyboard Shortcuts