Fossil SCM

Add a routine that attempts to strip all markup off of HTML text. The intended use is in the search logic.

drh 2015-01-31 19:58 trunk
Commit cbd8e67f73306549c85a79603c6d8fc1ba1ac4ea
1 file changed +72
--- src/wikiformat.c
+++ src/wikiformat.c
@@ -2092,9 +2092,81 @@
20922092
for(i=2; i<g.argc; i++){
20932093
blob_read_from_file(&in, g.argv[i]);
20942094
blob_zero(&out);
20952095
htmlTidy(blob_str(&in), &out);
20962096
blob_reset(&in);
2097
+ fossil_puts(blob_str(&out), 0);
2098
+ blob_reset(&out);
2099
+ }
2100
+}
2101
+
2102
+/*
2103
+** Remove all HTML markup from the input text. The output written into
2104
+** pOut is pure text.
2105
+*/
2106
+void html_to_plaintext(const char *zIn, Blob *pOut){
2107
+ int n;
2108
+ int i, j;
2109
+ int nNL = 0; /* Number of \n characters at the end of pOut */
2110
+ int nWS = 0; /* True if pOut ends with whitespace */
2111
+ while( zIn[0] ){
2112
+ n = nextHtmlToken(zIn);
2113
+ if( zIn[0]=='<' && n>1 ){
2114
+ int isCloseTag;
2115
+ int eTag;
2116
+ int eType;
2117
+ char zTag[32];
2118
+ isCloseTag = zIn[1]=='/';
2119
+ for(i=0, j=1+isCloseTag; i<30 && fossil_isalnum(zIn[j]); i++, j++){
2120
+ zTag[i] = fossil_tolower(zIn[j]);
2121
+ }
2122
+ zTag[i] = 0;
2123
+ eTag = findTag(zTag);
2124
+ eType = aMarkup[eTag].iType;
2125
+ if( eTag==MARKUP_INVALID && fossil_strnicmp(zIn,"<style",6)==0 ){
2126
+ zIn += n;
2127
+ while( zIn[0] ){
2128
+ n = nextHtmlToken(zIn);
2129
+ if( fossil_strnicmp(zIn, "</style",7)==0 ) break;
2130
+ zIn += n;
2131
+ }
2132
+ if( zIn[0]=='<' ) zIn += n;
2133
+ continue;
2134
+ }
2135
+ if( !isCloseTag && (eType & (MUTYPE_BLOCK|MUTYPE_TABLE))!=0 ){
2136
+ if( nNL==0 ){
2137
+ blob_append(pOut, "\n", 1);
2138
+ nNL++;
2139
+ }
2140
+ nWS = 1;
2141
+ }
2142
+ }else if( fossil_isspace(zIn[0]) ){
2143
+ for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;
2144
+ if( !nWS ){
2145
+ blob_append(pOut, nNL ? "\n" : " ", 1);
2146
+ nWS = 1;
2147
+ }
2148
+ }else{
2149
+ blob_append(pOut, zIn, n);
2150
+ nNL = nWS = 0;
2151
+ }
2152
+ zIn += n;
2153
+ }
2154
+ if( nNL==0 ) blob_append(pOut, "\n", 1);
2155
+}
2156
+
2157
+/*
2158
+** COMMAND: test-html-to-text
2159
+*/
2160
+void test_html_to_text(void){
2161
+ Blob in, out;
2162
+ int i;
2163
+
2164
+ for(i=2; i<g.argc; i++){
2165
+ blob_read_from_file(&in, g.argv[i]);
2166
+ blob_zero(&out);
2167
+ html_to_plaintext(blob_str(&in), &out);
2168
+ blob_reset(&in);
20972169
fossil_puts(blob_str(&out), 0);
20982170
blob_reset(&out);
20992171
}
21002172
}
21012173
--- src/wikiformat.c
+++ src/wikiformat.c
@@ -2092,9 +2092,81 @@
2092 for(i=2; i<g.argc; i++){
2093 blob_read_from_file(&in, g.argv[i]);
2094 blob_zero(&out);
2095 htmlTidy(blob_str(&in), &out);
2096 blob_reset(&in);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2097 fossil_puts(blob_str(&out), 0);
2098 blob_reset(&out);
2099 }
2100 }
2101
--- src/wikiformat.c
+++ src/wikiformat.c
@@ -2092,9 +2092,81 @@
2092 for(i=2; i<g.argc; i++){
2093 blob_read_from_file(&in, g.argv[i]);
2094 blob_zero(&out);
2095 htmlTidy(blob_str(&in), &out);
2096 blob_reset(&in);
2097 fossil_puts(blob_str(&out), 0);
2098 blob_reset(&out);
2099 }
2100 }
2101
2102 /*
2103 ** Remove all HTML markup from the input text. The output written into
2104 ** pOut is pure text.
2105 */
2106 void html_to_plaintext(const char *zIn, Blob *pOut){
2107 int n;
2108 int i, j;
2109 int nNL = 0; /* Number of \n characters at the end of pOut */
2110 int nWS = 0; /* True if pOut ends with whitespace */
2111 while( zIn[0] ){
2112 n = nextHtmlToken(zIn);
2113 if( zIn[0]=='<' && n>1 ){
2114 int isCloseTag;
2115 int eTag;
2116 int eType;
2117 char zTag[32];
2118 isCloseTag = zIn[1]=='/';
2119 for(i=0, j=1+isCloseTag; i<30 && fossil_isalnum(zIn[j]); i++, j++){
2120 zTag[i] = fossil_tolower(zIn[j]);
2121 }
2122 zTag[i] = 0;
2123 eTag = findTag(zTag);
2124 eType = aMarkup[eTag].iType;
2125 if( eTag==MARKUP_INVALID && fossil_strnicmp(zIn,"<style",6)==0 ){
2126 zIn += n;
2127 while( zIn[0] ){
2128 n = nextHtmlToken(zIn);
2129 if( fossil_strnicmp(zIn, "</style",7)==0 ) break;
2130 zIn += n;
2131 }
2132 if( zIn[0]=='<' ) zIn += n;
2133 continue;
2134 }
2135 if( !isCloseTag && (eType & (MUTYPE_BLOCK|MUTYPE_TABLE))!=0 ){
2136 if( nNL==0 ){
2137 blob_append(pOut, "\n", 1);
2138 nNL++;
2139 }
2140 nWS = 1;
2141 }
2142 }else if( fossil_isspace(zIn[0]) ){
2143 for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;
2144 if( !nWS ){
2145 blob_append(pOut, nNL ? "\n" : " ", 1);
2146 nWS = 1;
2147 }
2148 }else{
2149 blob_append(pOut, zIn, n);
2150 nNL = nWS = 0;
2151 }
2152 zIn += n;
2153 }
2154 if( nNL==0 ) blob_append(pOut, "\n", 1);
2155 }
2156
2157 /*
2158 ** COMMAND: test-html-to-text
2159 */
2160 void test_html_to_text(void){
2161 Blob in, out;
2162 int i;
2163
2164 for(i=2; i<g.argc; i++){
2165 blob_read_from_file(&in, g.argv[i]);
2166 blob_zero(&out);
2167 html_to_plaintext(blob_str(&in), &out);
2168 blob_reset(&in);
2169 fossil_puts(blob_str(&out), 0);
2170 blob_reset(&out);
2171 }
2172 }
2173

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button