Fossil SCM
Add a routine that attempts to strip all markup from HTML text. The intended use is in the search logic.
Commit
cbd8e67f73306549c85a79603c6d8fc1ba1ac4ea
Parent
f156a4adbfb3513…
1 file changed
+72
+72
| --- src/wikiformat.c | ||
| +++ src/wikiformat.c | ||
| @@ -2092,9 +2092,81 @@ | ||
| 2092 | 2092 | for(i=2; i<g.argc; i++){ |
| 2093 | 2093 | blob_read_from_file(&in, g.argv[i]); |
| 2094 | 2094 | blob_zero(&out); |
| 2095 | 2095 | htmlTidy(blob_str(&in), &out); |
| 2096 | 2096 | blob_reset(&in); |
| 2097 | + fossil_puts(blob_str(&out), 0); | |
| 2098 | + blob_reset(&out); | |
| 2099 | + } | |
| 2100 | +} | |
| 2101 | + | |
| 2102 | +/* | |
| 2103 | +** Remove all HTML markup from the input text. The output written into | |
| 2104 | +** pOut is pure text. | |
| 2105 | +*/ | |
| 2106 | +void html_to_plaintext(const char *zIn, Blob *pOut){ | |
| 2107 | + int n; | |
| 2108 | + int i, j; | |
| 2109 | + int nNL = 0; /* Number of \n characters at the end of pOut */ | |
| 2110 | + int nWS = 0; /* True if pOut ends with whitespace */ | |
| 2111 | + while( zIn[0] ){ | |
| 2112 | + n = nextHtmlToken(zIn); | |
| 2113 | + if( zIn[0]=='<' && n>1 ){ | |
| 2114 | + int isCloseTag; | |
| 2115 | + int eTag; | |
| 2116 | + int eType; | |
| 2117 | + char zTag[32]; | |
| 2118 | + isCloseTag = zIn[1]=='/'; | |
| 2119 | + for(i=0, j=1+isCloseTag; i<30 && fossil_isalnum(zIn[j]); i++, j++){ | |
| 2120 | + zTag[i] = fossil_tolower(zIn[j]); | |
| 2121 | + } | |
| 2122 | + zTag[i] = 0; | |
| 2123 | + eTag = findTag(zTag); | |
| 2124 | + eType = aMarkup[eTag].iType; | |
| 2125 | + if( eTag==MARKUP_INVALID && fossil_strnicmp(zIn,"<style",6)==0 ){ | |
| 2126 | + zIn += n; | |
| 2127 | + while( zIn[0] ){ | |
| 2128 | + n = nextHtmlToken(zIn); | |
| 2129 | + if( fossil_strnicmp(zIn, "</style",7)==0 ) break; | |
| 2130 | + zIn += n; | |
| 2131 | + } | |
| 2132 | + if( zIn[0]=='<' ) zIn += n; | |
| 2133 | + continue; | |
| 2134 | + } | |
| 2135 | + if( !isCloseTag && (eType & (MUTYPE_BLOCK|MUTYPE_TABLE))!=0 ){ | |
| 2136 | + if( nNL==0 ){ | |
| 2137 | + blob_append(pOut, "\n", 1); | |
| 2138 | + nNL++; | |
| 2139 | + } | |
| 2140 | + nWS = 1; | |
| 2141 | + } | |
| 2142 | + }else if( fossil_isspace(zIn[0]) ){ | |
| 2143 | + for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++; | |
| 2144 | + if( !nWS ){ | |
| 2145 | + blob_append(pOut, nNL ? "\n" : " ", 1); | |
| 2146 | + nWS = 1; | |
| 2147 | + } | |
| 2148 | + }else{ | |
| 2149 | + blob_append(pOut, zIn, n); | |
| 2150 | + nNL = nWS = 0; | |
| 2151 | + } | |
| 2152 | + zIn += n; | |
| 2153 | + } | |
| 2154 | + if( nNL==0 ) blob_append(pOut, "\n", 1); | |
| 2155 | +} | |
| 2156 | + | |
| 2157 | +/* | |
| 2158 | +** COMMAND: test-html-to-text | |
| 2159 | +*/ | |
| 2160 | +void test_html_to_text(void){ | |
| 2161 | + Blob in, out; | |
| 2162 | + int i; | |
| 2163 | + | |
| 2164 | + for(i=2; i<g.argc; i++){ | |
| 2165 | + blob_read_from_file(&in, g.argv[i]); | |
| 2166 | + blob_zero(&out); | |
| 2167 | + html_to_plaintext(blob_str(&in), &out); | |
| 2168 | + blob_reset(&in); | |
| 2097 | 2169 | fossil_puts(blob_str(&out), 0); |
| 2098 | 2170 | blob_reset(&out); |
| 2099 | 2171 | } |
| 2100 | 2172 | } |
| 2101 | 2173 |
| --- src/wikiformat.c | |
| +++ src/wikiformat.c | |
| @@ -2092,9 +2092,81 @@ | |
| 2092 | for(i=2; i<g.argc; i++){ |
| 2093 | blob_read_from_file(&in, g.argv[i]); |
| 2094 | blob_zero(&out); |
| 2095 | htmlTidy(blob_str(&in), &out); |
| 2096 | blob_reset(&in); |
| 2097 | fossil_puts(blob_str(&out), 0); |
| 2098 | blob_reset(&out); |
| 2099 | } |
| 2100 | } |
| 2101 |
| --- src/wikiformat.c | |
| +++ src/wikiformat.c | |
| @@ -2092,9 +2092,81 @@ | |
| 2092 | for(i=2; i<g.argc; i++){ |
| 2093 | blob_read_from_file(&in, g.argv[i]); |
| 2094 | blob_zero(&out); |
| 2095 | htmlTidy(blob_str(&in), &out); |
| 2096 | blob_reset(&in); |
| 2097 | fossil_puts(blob_str(&out), 0); |
| 2098 | blob_reset(&out); |
| 2099 | } |
| 2100 | } |
| 2101 | |
| 2102 | /* |
| 2103 | ** Remove all HTML markup from the input text. The output written into |
| 2104 | ** pOut is pure text. |
| 2105 | */ |
| 2106 | void html_to_plaintext(const char *zIn, Blob *pOut){ |
| 2107 | int n; |
| 2108 | int i, j; |
| 2109 | int nNL = 0; /* Number of \n characters at the end of pOut */ |
| 2110 | int nWS = 0; /* True if pOut ends with whitespace */ |
| 2111 | while( zIn[0] ){ |
| 2112 | n = nextHtmlToken(zIn); |
| 2113 | if( zIn[0]=='<' && n>1 ){ |
| 2114 | int isCloseTag; |
| 2115 | int eTag; |
| 2116 | int eType; |
| 2117 | char zTag[32]; |
| 2118 | isCloseTag = zIn[1]=='/'; |
| 2119 | for(i=0, j=1+isCloseTag; i<30 && fossil_isalnum(zIn[j]); i++, j++){ |
| 2120 | zTag[i] = fossil_tolower(zIn[j]); |
| 2121 | } |
| 2122 | zTag[i] = 0; |
| 2123 | eTag = findTag(zTag); |
| 2124 | eType = aMarkup[eTag].iType; |
| 2125 | if( eTag==MARKUP_INVALID && fossil_strnicmp(zIn,"<style",6)==0 ){ |
| 2126 | zIn += n; |
| 2127 | while( zIn[0] ){ |
| 2128 | n = nextHtmlToken(zIn); |
| 2129 | if( fossil_strnicmp(zIn, "</style",7)==0 ) break; |
| 2130 | zIn += n; |
| 2131 | } |
| 2132 | if( zIn[0]=='<' ) zIn += n; |
| 2133 | continue; |
| 2134 | } |
| 2135 | if( !isCloseTag && (eType & (MUTYPE_BLOCK|MUTYPE_TABLE))!=0 ){ |
| 2136 | if( nNL==0 ){ |
| 2137 | blob_append(pOut, "\n", 1); |
| 2138 | nNL++; |
| 2139 | } |
| 2140 | nWS = 1; |
| 2141 | } |
| 2142 | }else if( fossil_isspace(zIn[0]) ){ |
| 2143 | for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++; |
| 2144 | if( !nWS ){ |
| 2145 | blob_append(pOut, nNL ? "\n" : " ", 1); |
| 2146 | nWS = 1; |
| 2147 | } |
| 2148 | }else{ |
| 2149 | blob_append(pOut, zIn, n); |
| 2150 | nNL = nWS = 0; |
| 2151 | } |
| 2152 | zIn += n; |
| 2153 | } |
| 2154 | if( nNL==0 ) blob_append(pOut, "\n", 1); |
| 2155 | } |
| 2156 | |
| 2157 | /* |
| 2158 | ** COMMAND: test-html-to-text |
| 2159 | */ |
| 2160 | void test_html_to_text(void){ |
| 2161 | Blob in, out; |
| 2162 | int i; |
| 2163 | |
| 2164 | for(i=2; i<g.argc; i++){ |
| 2165 | blob_read_from_file(&in, g.argv[i]); |
| 2166 | blob_zero(&out); |
| 2167 | html_to_plaintext(blob_str(&in), &out); |
| 2168 | blob_reset(&in); |
| 2169 | fossil_puts(blob_str(&out), 0); |
| 2170 | blob_reset(&out); |
| 2171 | } |
| 2172 | } |
| 2173 |