Hugoifier
feat: direct HTML extraction for raw HTML path (no AI)

Apply the same philosophy as the Next.js rendered capture: use the actual HTML directly instead of asking AI to reinterpret it.

- hugoify_html() now extracts head/body from real HTML, no AI call
- Preserves all content, classes, SVGs, styles exactly as-is
- Copies ALL static assets from HTML theme dir (not just known names)
- Zero latency, zero token cost, zero content loss
- AI multi-pass fallback still available for Next.js without dev server
Commit
1efd3dd395b9c46745d63a7c06b620776031023296314ae24cbcfa17b964ef08
Parent
6b32ba2bba58cf0…
2 files changed
hugoifier/utils/complete.py: +12 -7
hugoifier/utils/hugoify.py: +54 -39
+12
-7
| --- hugoifier/utils/complete.py | ||
| +++ hugoifier/utils/complete.py | ||
| @@ -208,13 +208,13 @@ | ||
| 208 | 208 | if output_dir is None: |
| 209 | 209 | output_dir = str(Path(__file__).parents[2] / 'output' / theme_name) |
| 210 | 210 | |
| 211 | 211 | logging.info(f"Converting raw HTML theme: {theme_name}") |
| 212 | 212 | |
| 213 | - # Use AI to convert the main HTML file to Hugo layouts | |
| 213 | + # Direct HTML extraction — use the actual HTML as-is, no AI reinterpretation | |
| 214 | 214 | main_html = _pick_main_html(html_files) |
| 215 | - logging.info(f"Converting {main_html} ...") | |
| 215 | + logging.info(f"Extracting {main_html} ...") | |
| 216 | 216 | hugo_layouts = hugoify_html(main_html) |
| 217 | 217 | |
| 218 | 218 | os.makedirs(output_dir, exist_ok=True) |
| 219 | 219 | |
| 220 | 220 | # Write converted layouts |
| @@ -226,15 +226,20 @@ | ||
| 226 | 226 | dest = os.path.join(theme_layouts_dir, filename) |
| 227 | 227 | os.makedirs(os.path.dirname(dest), exist_ok=True) |
| 228 | 228 | with open(dest, 'w') as f: |
| 229 | 229 | f.write(content) |
| 230 | 230 | |
| 231 | - # Copy CSS/JS/images | |
| 232 | - for ext_dir in ('css', 'js', 'images', 'img', 'assets', 'fonts'): | |
| 233 | - src = os.path.join(input_path, ext_dir) | |
| 234 | - if os.path.isdir(src): | |
| 235 | - _copy_dir(src, os.path.join(output_dir, 'themes', theme_name, 'static', ext_dir)) | |
| 231 | + # Copy ALL static assets from the HTML theme directory | |
| 232 | + theme_static = os.path.join(output_dir, 'themes', theme_name, 'static') | |
| 233 | + for item in os.listdir(input_path): | |
| 234 | + src = os.path.join(input_path, item) | |
| 235 | + if os.path.isdir(src) and item not in ('__MACOSX', '.git', 'node_modules'): | |
| 236 | + _copy_dir(src, os.path.join(theme_static, item)) | |
| 237 | + elif os.path.isfile(src) and not item.endswith('.html'): | |
| 238 | + # Copy non-HTML files (images, fonts, etc.) to static root | |
| 239 | + os.makedirs(theme_static, exist_ok=True) | |
| 240 | + shutil.copy2(src, os.path.join(theme_static, item)) | |
| 236 | 241 | |
| 237 | 242 | _write_minimal_hugo_toml(output_dir, theme_name) |
| 238 | 243 | |
| 239 | 244 | # Create minimal content |
| 240 | 245 | content_dir = os.path.join(output_dir, 'content') |
| 241 | 246 |
| --- hugoifier/utils/complete.py | |
| +++ hugoifier/utils/complete.py | |
| @@ -208,13 +208,13 @@ | |
| 208 | if output_dir is None: |
| 209 | output_dir = str(Path(__file__).parents[2] / 'output' / theme_name) |
| 210 | |
| 211 | logging.info(f"Converting raw HTML theme: {theme_name}") |
| 212 | |
| 213 | # Use AI to convert the main HTML file to Hugo layouts |
| 214 | main_html = _pick_main_html(html_files) |
| 215 | logging.info(f"Converting {main_html} ...") |
| 216 | hugo_layouts = hugoify_html(main_html) |
| 217 | |
| 218 | os.makedirs(output_dir, exist_ok=True) |
| 219 | |
| 220 | # Write converted layouts |
| @@ -226,15 +226,20 @@ | |
| 226 | dest = os.path.join(theme_layouts_dir, filename) |
| 227 | os.makedirs(os.path.dirname(dest), exist_ok=True) |
| 228 | with open(dest, 'w') as f: |
| 229 | f.write(content) |
| 230 | |
| 231 | # Copy CSS/JS/images |
| 232 | for ext_dir in ('css', 'js', 'images', 'img', 'assets', 'fonts'): |
| 233 | src = os.path.join(input_path, ext_dir) |
| 234 | if os.path.isdir(src): |
| 235 | _copy_dir(src, os.path.join(output_dir, 'themes', theme_name, 'static', ext_dir)) |
| 236 | |
| 237 | _write_minimal_hugo_toml(output_dir, theme_name) |
| 238 | |
| 239 | # Create minimal content |
| 240 | content_dir = os.path.join(output_dir, 'content') |
| 241 |
| --- hugoifier/utils/complete.py | |
| +++ hugoifier/utils/complete.py | |
| @@ -208,13 +208,13 @@ | |
| 208 | if output_dir is None: |
| 209 | output_dir = str(Path(__file__).parents[2] / 'output' / theme_name) |
| 210 | |
| 211 | logging.info(f"Converting raw HTML theme: {theme_name}") |
| 212 | |
| 213 | # Direct HTML extraction — use the actual HTML as-is, no AI reinterpretation |
| 214 | main_html = _pick_main_html(html_files) |
| 215 | logging.info(f"Extracting {main_html} ...") |
| 216 | hugo_layouts = hugoify_html(main_html) |
| 217 | |
| 218 | os.makedirs(output_dir, exist_ok=True) |
| 219 | |
| 220 | # Write converted layouts |
| @@ -226,15 +226,20 @@ | |
| 226 | dest = os.path.join(theme_layouts_dir, filename) |
| 227 | os.makedirs(os.path.dirname(dest), exist_ok=True) |
| 228 | with open(dest, 'w') as f: |
| 229 | f.write(content) |
| 230 | |
| 231 | # Copy ALL static assets from the HTML theme directory |
| 232 | theme_static = os.path.join(output_dir, 'themes', theme_name, 'static') |
| 233 | for item in os.listdir(input_path): |
| 234 | src = os.path.join(input_path, item) |
| 235 | if os.path.isdir(src) and item not in ('__MACOSX', '.git', 'node_modules'): |
| 236 | _copy_dir(src, os.path.join(theme_static, item)) |
| 237 | elif os.path.isfile(src) and not item.endswith('.html'): |
| 238 | # Copy non-HTML files (images, fonts, etc.) to static root |
| 239 | os.makedirs(theme_static, exist_ok=True) |
| 240 | shutil.copy2(src, os.path.join(theme_static, item)) |
| 241 | |
| 242 | _write_minimal_hugo_toml(output_dir, theme_name) |
| 243 | |
| 244 | # Create minimal content |
| 245 | content_dir = os.path.join(output_dir, 'content') |
| 246 |
+54
-39
| --- hugoifier/utils/hugoify.py | ||
| +++ hugoifier/utils/hugoify.py | ||
| @@ -28,55 +28,70 @@ | ||
| 28 | 28 | |
| 29 | 29 | def hugoify_html(html_path: str) -> dict: |
| 30 | 30 | """ |
| 31 | 31 | Convert a raw HTML file to a set of Hugo layout files. |
| 32 | 32 | |
| 33 | - Returns dict mapping relative layout paths to their content, e.g.: | |
| 34 | - { | |
| 35 | - "_default/baseof.html": "<!DOCTYPE html>...", | |
| 36 | - "partials/header.html": "<header>...", | |
| 37 | - "partials/footer.html": "<footer>...", | |
| 38 | - "index.html": "{{ define \"main\" }}...", | |
| 39 | - } | |
| 33 | + Uses direct HTML extraction (no AI) to preserve content exactly as-is. | |
| 34 | + Splits the HTML into Hugo's baseof.html (head/shell) and index.html (body content). | |
| 35 | + | |
| 36 | + Returns dict mapping relative layout paths to their content. | |
| 40 | 37 | """ |
| 41 | 38 | logging.info(f"Hugoifying {html_path} ...") |
| 42 | 39 | |
| 43 | 40 | with open(html_path, 'r', errors='replace') as f: |
| 44 | 41 | html = f.read() |
| 45 | 42 | |
| 46 | - # Truncate very large files to avoid token limits | |
| 47 | - if len(html) > 30000: | |
| 48 | - logging.warning(f"HTML is large ({len(html)} chars), truncating to 30000 for AI analysis") | |
| 49 | - html = html[:30000] | |
| 50 | - | |
| 51 | - prompt = f"""Convert the following HTML file into Hugo layout files. | |
| 52 | - | |
| 53 | -Return a JSON object where keys are relative file paths under layouts/ and values are the Hugo template content. | |
| 54 | - | |
| 55 | -Required keys to produce: | |
| 56 | -- "_default/baseof.html" — base template with blocks for head, header, main, footer | |
| 57 | -- "partials/header.html" — site header/nav extracted as partial | |
| 58 | -- "partials/footer.html" — footer extracted as partial | |
| 59 | -- "index.html" — homepage using {{ define "main" }} ... {{ end }} | |
| 60 | - | |
| 61 | -Rules: | |
| 62 | -- Replace hardcoded page titles with {{ .Title }} | |
| 63 | -- Replace hardcoded site name with {{ .Site.Title }} | |
| 64 | -- Replace hardcoded URLs with {{ .Site.BaseURL }} or {{ .Permalink }} | |
| 65 | -- Replace nav links with {{ range .Site.Menus.main }}<a href="{{ .URL }}">{{ .Name }}</a>{{ end }} | |
| 66 | -- Replace blog post lists with {{ range .Pages }} ... {{ end }} | |
| 67 | -- Replace copyright year with {{ now.Year }} | |
| 68 | -- Keep all CSS classes and HTML structure intact | |
| 69 | -- Use {{ partial "header.html" . }} and {{ partial "footer.html" . }} in baseof.html | |
| 70 | - | |
| 71 | -HTML to convert: | |
| 72 | -{html} | |
| 73 | - | |
| 74 | -Return ONLY a valid JSON object, no explanation.""" | |
| 75 | - | |
| 76 | - response = call_ai(prompt, SYSTEM) | |
| 77 | - return _parse_layout_json(response) | |
| 43 | + logging.info(f"Read {len(html)} chars from {html_path}") | |
| 44 | + | |
| 45 | + # Extract <head> content (CSS links, meta, fonts, etc.) | |
| 46 | + head_extras = _extract_head_content(html) | |
| 47 | + | |
| 48 | + # Extract and rewrite CSS/JS paths to be relative to Hugo static/ | |
| 49 | + css_links = re.findall(r'<link[^>]+rel=["\']stylesheet["\'][^>]*/?>', | |
| 50 | + html, re.DOTALL | re.IGNORECASE) | |
| 51 | + js_links = re.findall(r'<script[^>]+src=["\'][^"\']+["\'][^>]*>.*?</script>', | |
| 52 | + html, re.DOTALL) | |
| 53 | + | |
| 54 | + # Extract <body> content | |
| 55 | + body_match = re.search(r'<body[^>]*>(.*?)</body>', html, re.DOTALL) | |
| 56 | + body_content = body_match.group(1).strip() if body_match else html | |
| 57 | + | |
| 58 | + # Extract body attributes (class, style, etc.) | |
| 59 | + body_attrs_match = re.search(r'<body([^>]*)>', html) | |
| 60 | + body_attrs = body_attrs_match.group(1).strip() if body_attrs_match else '' | |
| 61 | + | |
| 62 | + # Build baseof.html preserving the original <head> structure | |
| 63 | + head_match = re.search(r'<head[^>]*>(.*?)</head>', html, re.DOTALL) | |
| 64 | + if head_match: | |
| 65 | + head_content = head_match.group(1).strip() | |
| 66 | + # Replace hardcoded <title> with Hugo template | |
| 67 | + head_content = re.sub( | |
| 68 | + r'<title>[^<]*</title>', | |
| 69 | + '<title>{{ if .IsHome }}{{ .Site.Title }}{{ else }}{{ .Title }} | {{ .Site.Title }}{{ end }}</title>', | |
| 70 | + head_content | |
| 71 | + ) | |
| 72 | + baseof = f'''<!DOCTYPE html> | |
| 73 | +<html lang="{{{{ with .Site.LanguageCode }}}}{{{{ . }}}}{{{{ else }}}}en{{{{ end }}}}"> | |
| 74 | +<head> | |
| 75 | +{head_content} | |
| 76 | +</head> | |
| 77 | +<body{" " + body_attrs if body_attrs else ""}> | |
| 78 | + {{{{- block "main" . }}}}{{{{- end }}}} | |
| 79 | +</body> | |
| 80 | +</html>''' | |
| 81 | + else: | |
| 82 | + baseof = _fallback_baseof() | |
| 83 | + | |
| 84 | + index_html = f'{{{{ define "main" }}}}\n{body_content}\n{{{{ end }}}}' | |
| 85 | + | |
| 86 | + layouts = { | |
| 87 | + "_default/baseof.html": baseof, | |
| 88 | + "index.html": index_html, | |
| 89 | + } | |
| 90 | + | |
| 91 | + logging.info(f"Extracted {len(layouts)} layout files directly from HTML (no AI)") | |
| 92 | + return layouts | |
| 78 | 93 | |
| 79 | 94 | |
| 80 | 95 | def hugoify_nextjs(info: dict, dev_url: str = None) -> dict: |
| 81 | 96 | """ |
| 82 | 97 | Convert a Next.js app to a set of Hugo layout files. |
| 83 | 98 |
| --- hugoifier/utils/hugoify.py | |
| +++ hugoifier/utils/hugoify.py | |
| @@ -28,55 +28,70 @@ | |
| 28 | |
| 29 | def hugoify_html(html_path: str) -> dict: |
| 30 | """ |
| 31 | Convert a raw HTML file to a set of Hugo layout files. |
| 32 | |
| 33 | Returns dict mapping relative layout paths to their content, e.g.: |
| 34 | { |
| 35 | "_default/baseof.html": "<!DOCTYPE html>...", |
| 36 | "partials/header.html": "<header>...", |
| 37 | "partials/footer.html": "<footer>...", |
| 38 | "index.html": "{{ define \"main\" }}...", |
| 39 | } |
| 40 | """ |
| 41 | logging.info(f"Hugoifying {html_path} ...") |
| 42 | |
| 43 | with open(html_path, 'r', errors='replace') as f: |
| 44 | html = f.read() |
| 45 | |
| 46 | # Truncate very large files to avoid token limits |
| 47 | if len(html) > 30000: |
| 48 | logging.warning(f"HTML is large ({len(html)} chars), truncating to 30000 for AI analysis") |
| 49 | html = html[:30000] |
| 50 | |
| 51 | prompt = f"""Convert the following HTML file into Hugo layout files. |
| 52 | |
| 53 | Return a JSON object where keys are relative file paths under layouts/ and values are the Hugo template content. |
| 54 | |
| 55 | Required keys to produce: |
| 56 | - "_default/baseof.html" — base template with blocks for head, header, main, footer |
| 57 | - "partials/header.html" — site header/nav extracted as partial |
| 58 | - "partials/footer.html" — footer extracted as partial |
| 59 | - "index.html" — homepage using {{ define "main" }} ... {{ end }} |
| 60 | |
| 61 | Rules: |
| 62 | - Replace hardcoded page titles with {{ .Title }} |
| 63 | - Replace hardcoded site name with {{ .Site.Title }} |
| 64 | - Replace hardcoded URLs with {{ .Site.BaseURL }} or {{ .Permalink }} |
| 65 | - Replace nav links with {{ range .Site.Menus.main }}<a href="{{ .URL }}">{{ .Name }}</a>{{ end }} |
| 66 | - Replace blog post lists with {{ range .Pages }} ... {{ end }} |
| 67 | - Replace copyright year with {{ now.Year }} |
| 68 | - Keep all CSS classes and HTML structure intact |
| 69 | - Use {{ partial "header.html" . }} and {{ partial "footer.html" . }} in baseof.html |
| 70 | |
| 71 | HTML to convert: |
| 72 | {html} |
| 73 | |
| 74 | Return ONLY a valid JSON object, no explanation.""" |
| 75 | |
| 76 | response = call_ai(prompt, SYSTEM) |
| 77 | return _parse_layout_json(response) |
| 78 | |
| 79 | |
| 80 | def hugoify_nextjs(info: dict, dev_url: str = None) -> dict: |
| 81 | """ |
| 82 | Convert a Next.js app to a set of Hugo layout files. |
| 83 |
| --- hugoifier/utils/hugoify.py | |
| +++ hugoifier/utils/hugoify.py | |
| @@ -28,55 +28,70 @@ | |
| 28 | |
| 29 | def hugoify_html(html_path: str) -> dict: |
| 30 | """ |
| 31 | Convert a raw HTML file to a set of Hugo layout files. |
| 32 | |
| 33 | Uses direct HTML extraction (no AI) to preserve content exactly as-is. |
| 34 | Splits the HTML into Hugo's baseof.html (head/shell) and index.html (body content). |
| 35 | |
| 36 | Returns dict mapping relative layout paths to their content. |
| 37 | """ |
| 38 | logging.info(f"Hugoifying {html_path} ...") |
| 39 | |
| 40 | with open(html_path, 'r', errors='replace') as f: |
| 41 | html = f.read() |
| 42 | |
| 43 | logging.info(f"Read {len(html)} chars from {html_path}") |
| 44 | |
| 45 | # Extract <head> content (CSS links, meta, fonts, etc.) |
| 46 | head_extras = _extract_head_content(html) |
| 47 | |
| 48 | # Extract and rewrite CSS/JS paths to be relative to Hugo static/ |
| 49 | css_links = re.findall(r'<link[^>]+rel=["\']stylesheet["\'][^>]*/?>', |
| 50 | html, re.DOTALL | re.IGNORECASE) |
| 51 | js_links = re.findall(r'<script[^>]+src=["\'][^"\']+["\'][^>]*>.*?</script>', |
| 52 | html, re.DOTALL) |
| 53 | |
| 54 | # Extract <body> content |
| 55 | body_match = re.search(r'<body[^>]*>(.*?)</body>', html, re.DOTALL) |
| 56 | body_content = body_match.group(1).strip() if body_match else html |
| 57 | |
| 58 | # Extract body attributes (class, style, etc.) |
| 59 | body_attrs_match = re.search(r'<body([^>]*)>', html) |
| 60 | body_attrs = body_attrs_match.group(1).strip() if body_attrs_match else '' |
| 61 | |
| 62 | # Build baseof.html preserving the original <head> structure |
| 63 | head_match = re.search(r'<head[^>]*>(.*?)</head>', html, re.DOTALL) |
| 64 | if head_match: |
| 65 | head_content = head_match.group(1).strip() |
| 66 | # Replace hardcoded <title> with Hugo template |
| 67 | head_content = re.sub( |
| 68 | r'<title>[^<]*</title>', |
| 69 | '<title>{{ if .IsHome }}{{ .Site.Title }}{{ else }}{{ .Title }} | {{ .Site.Title }}{{ end }}</title>', |
| 70 | head_content |
| 71 | ) |
| 72 | baseof = f'''<!DOCTYPE html> |
| 73 | <html lang="{{{{ with .Site.LanguageCode }}}}{{{{ . }}}}{{{{ else }}}}en{{{{ end }}}}"> |
| 74 | <head> |
| 75 | {head_content} |
| 76 | </head> |
| 77 | <body{" " + body_attrs if body_attrs else ""}> |
| 78 | {{{{- block "main" . }}}}{{{{- end }}}} |
| 79 | </body> |
| 80 | </html>''' |
| 81 | else: |
| 82 | baseof = _fallback_baseof() |
| 83 | |
| 84 | index_html = f'{{{{ define "main" }}}}\n{body_content}\n{{{{ end }}}}' |
| 85 | |
| 86 | layouts = { |
| 87 | "_default/baseof.html": baseof, |
| 88 | "index.html": index_html, |
| 89 | } |
| 90 | |
| 91 | logging.info(f"Extracted {len(layouts)} layout files directly from HTML (no AI)") |
| 92 | return layouts |
| 93 | |
| 94 | |
| 95 | def hugoify_nextjs(info: dict, dev_url: str = None) -> dict: |
| 96 | """ |
| 97 | Convert a Next.js app to a set of Hugo layout files. |
| 98 |