Fossil SCM
Striving to make search work better. These changes will require search indexes to be rebuilt, so they go into a branch for now.
Commit
e0df485935abab4c48f078d89f0c89d5d5f25334
Parent
ef2db66199048db…
1 file changed
+41
-3
+41
-3
| --- src/wikiformat.c | ||
| +++ src/wikiformat.c | ||
| @@ -1965,17 +1965,26 @@ | ||
| 1965 | 1965 | ** z points to the start of a token. Return the number of |
| 1966 | 1966 | ** characters in that token. |
| 1967 | 1967 | */ |
| 1968 | 1968 | static int nextHtmlToken(const char *z){ |
| 1969 | 1969 | int n; |
| 1970 | - if( z[0]=='<' ){ | |
| 1970 | + char c; | |
| 1971 | + if( (c=z[0])=='<' ){ | |
| 1971 | 1972 | n = markupLength(z); |
| 1972 | 1973 | if( n<=0 ) n = 1; |
| 1973 | - }else if( fossil_isspace(z[0]) ){ | |
| 1974 | + }else if( fossil_isspace(c) ){ | |
| 1974 | 1975 | for(n=1; z[n] && fossil_isspace(z[n]); n++){} |
| 1976 | + }else if( c=='&' ){ | |
| 1977 | + n = z[1]=='#' ? 2 : 1; | |
| 1978 | + while( fossil_isalnum(z[n]) ) n++; | |
| 1979 | + if( z[n]==';' ) n++; | |
| 1975 | 1980 | }else{ |
| 1976 | - for(n=1; z[n] && z[n]!='<' && !fossil_isspace(z[n]); n++){} | |
| 1981 | + n = 1; | |
| 1982 | + for(n=1; 1; n++){ | |
| 1983 | + if( (c = z[n]) > '<' ) continue; | |
| 1984 | + if( c=='<' || c=='&' || fossil_isspace(c) || c==0 ) break; | |
| 1985 | + } | |
| 1977 | 1986 | } |
| 1978 | 1987 | return n; |
| 1979 | 1988 | } |
| 1980 | 1989 | |
| 1981 | 1990 | /* |
| @@ -2106,10 +2115,11 @@ | ||
| 2106 | 2115 | void html_to_plaintext(const char *zIn, Blob *pOut){ |
| 2107 | 2116 | int n; |
| 2108 | 2117 | int i, j; |
| 2109 | 2118 | int nNL = 0; /* Number of \n characters at the end of pOut */ |
| 2110 | 2119 | int nWS = 0; /* True if pOut ends with whitespace */ |
| 2120 | + while( fossil_isspace(zIn[0]) ) zIn++; | |
| 2111 | 2121 | while( zIn[0] ){ |
| 2112 | 2122 | n = nextHtmlToken(zIn); |
| 2113 | 2123 | if( zIn[0]=='<' && n>1 ){ |
| 2114 | 2124 | int isCloseTag; |
| 2115 | 2125 | int eTag; |
| @@ -2142,10 +2152,38 @@ | ||
| 2142 | 2152 | }else if( fossil_isspace(zIn[0]) ){ |
| 2143 | 2153 | for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++; |
| 2144 | 2154 | if( !nWS ){ |
| 2145 | 2155 | blob_append(pOut, nNL ? "\n" : " ", 1); |
| 2146 | 2156 | nWS = 1; |
| 2157 | + } | |
| 2158 | + }else if( zIn[0]=='&' ){ | |
| 2159 | + char c = '?'; | |
| 2160 | + if( zIn[1]=='#' ){ | |
| 2161 | + int x = atoi(&zIn[1]); | |
| 2162 | + if( x>0 && x<=127 ) c = x; | |
| 2163 | + }else{ | |
| 2164 | + static const struct { int n; char c; char *z; } aEntity[] = { | |
| 2165 | + { 5, '&', "&amp;" }, | |
| 2166 | + { 4, '<', "&lt;" }, | |
| 2167 | + { 4, '>', "&gt;" }, | |
| 2168 | + { 6, ' ', "&nbsp;" }, | |
| 2169 | + }; | |
| 2170 | + int jj; | |
| 2171 | + for(jj=0; jj<ArraySize(aEntity); jj++){ | |
| 2172 | + if( aEntity[jj].n==n && strncmp(aEntity[jj].z,zIn,n)==0 ){ | |
| 2173 | + c = aEntity[jj].c; | |
| 2174 | + break; | |
| 2175 | + } | |
| 2176 | + } | |
| 2177 | + } | |
| 2178 | + if( fossil_isspace(c) ){ | |
| 2179 | + if( nWS==0 ) blob_append(pOut, &c, 1); | |
| 2180 | + nWS = 1; | |
| 2181 | + nNL = c=='\n'; | |
| 2182 | + }else{ | |
| 2183 | + blob_append(pOut, &c, 1); | |
| 2184 | + nWS = nNL = 0; | |
| 2147 | 2185 | } |
| 2148 | 2186 | }else{ |
| 2149 | 2187 | blob_append(pOut, zIn, n); |
| 2150 | 2188 | nNL = nWS = 0; |
| 2151 | 2189 | } |
| 2152 | 2190 |
| --- src/wikiformat.c | |
| +++ src/wikiformat.c | |
| @@ -1965,17 +1965,26 @@ | |
| 1965 | ** z points to the start of a token. Return the number of |
| 1966 | ** characters in that token. |
| 1967 | */ |
| 1968 | static int nextHtmlToken(const char *z){ |
| 1969 | int n; |
| 1970 | if( z[0]=='<' ){ |
| 1971 | n = markupLength(z); |
| 1972 | if( n<=0 ) n = 1; |
| 1973 | }else if( fossil_isspace(z[0]) ){ |
| 1974 | for(n=1; z[n] && fossil_isspace(z[n]); n++){} |
| 1975 | }else{ |
| 1976 | for(n=1; z[n] && z[n]!='<' && !fossil_isspace(z[n]); n++){} |
| 1977 | } |
| 1978 | return n; |
| 1979 | } |
| 1980 | |
| 1981 | /* |
| @@ -2106,10 +2115,11 @@ | |
| 2106 | void html_to_plaintext(const char *zIn, Blob *pOut){ |
| 2107 | int n; |
| 2108 | int i, j; |
| 2109 | int nNL = 0; /* Number of \n characters at the end of pOut */ |
| 2110 | int nWS = 0; /* True if pOut ends with whitespace */ |
| 2111 | while( zIn[0] ){ |
| 2112 | n = nextHtmlToken(zIn); |
| 2113 | if( zIn[0]=='<' && n>1 ){ |
| 2114 | int isCloseTag; |
| 2115 | int eTag; |
| @@ -2142,10 +2152,38 @@ | |
| 2142 | }else if( fossil_isspace(zIn[0]) ){ |
| 2143 | for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++; |
| 2144 | if( !nWS ){ |
| 2145 | blob_append(pOut, nNL ? "\n" : " ", 1); |
| 2146 | nWS = 1; |
| 2147 | } |
| 2148 | }else{ |
| 2149 | blob_append(pOut, zIn, n); |
| 2150 | nNL = nWS = 0; |
| 2151 | } |
| 2152 |
| --- src/wikiformat.c | |
| +++ src/wikiformat.c | |
| @@ -1965,17 +1965,26 @@ | |
| 1965 | ** z points to the start of a token. Return the number of |
| 1966 | ** characters in that token. |
| 1967 | */ |
| 1968 | static int nextHtmlToken(const char *z){ |
| 1969 | int n; |
| 1970 | char c; |
| 1971 | if( (c=z[0])=='<' ){ |
| 1972 | n = markupLength(z); |
| 1973 | if( n<=0 ) n = 1; |
| 1974 | }else if( fossil_isspace(c) ){ |
| 1975 | for(n=1; z[n] && fossil_isspace(z[n]); n++){} |
| 1976 | }else if( c=='&' ){ |
| 1977 | n = z[1]=='#' ? 2 : 1; |
| 1978 | while( fossil_isalnum(z[n]) ) n++; |
| 1979 | if( z[n]==';' ) n++; |
| 1980 | }else{ |
| 1981 | n = 1; |
| 1982 | for(n=1; 1; n++){ |
| 1983 | if( (c = z[n]) > '<' ) continue; |
| 1984 | if( c=='<' || c=='&' || fossil_isspace(c) || c==0 ) break; |
| 1985 | } |
| 1986 | } |
| 1987 | return n; |
| 1988 | } |
| 1989 | |
| 1990 | /* |
| @@ -2106,10 +2115,11 @@ | |
| 2115 | void html_to_plaintext(const char *zIn, Blob *pOut){ |
| 2116 | int n; |
| 2117 | int i, j; |
| 2118 | int nNL = 0; /* Number of \n characters at the end of pOut */ |
| 2119 | int nWS = 0; /* True if pOut ends with whitespace */ |
| 2120 | while( fossil_isspace(zIn[0]) ) zIn++; |
| 2121 | while( zIn[0] ){ |
| 2122 | n = nextHtmlToken(zIn); |
| 2123 | if( zIn[0]=='<' && n>1 ){ |
| 2124 | int isCloseTag; |
| 2125 | int eTag; |
| @@ -2142,10 +2152,38 @@ | |
| 2152 | }else if( fossil_isspace(zIn[0]) ){ |
| 2153 | for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++; |
| 2154 | if( !nWS ){ |
| 2155 | blob_append(pOut, nNL ? "\n" : " ", 1); |
| 2156 | nWS = 1; |
| 2157 | } |
| 2158 | }else if( zIn[0]=='&' ){ |
| 2159 | char c = '?'; |
| 2160 | if( zIn[1]=='#' ){ |
| 2161 | int x = atoi(&zIn[1]); |
| 2162 | if( x>0 && x<=127 ) c = x; |
| 2163 | }else{ |
| 2164 | static const struct { int n; char c; char *z; } aEntity[] = { |
| 2165 | { 5, '&', "&amp;" }, |
| 2166 | { 4, '<', "&lt;" }, |
| 2167 | { 4, '>', "&gt;" }, |
| 2168 | { 6, ' ', "&nbsp;" }, |
| 2169 | }; |
| 2170 | int jj; |
| 2171 | for(jj=0; jj<ArraySize(aEntity); jj++){ |
| 2172 | if( aEntity[jj].n==n && strncmp(aEntity[jj].z,zIn,n)==0 ){ |
| 2173 | c = aEntity[jj].c; |
| 2174 | break; |
| 2175 | } |
| 2176 | } |
| 2177 | } |
| 2178 | if( fossil_isspace(c) ){ |
| 2179 | if( nWS==0 ) blob_append(pOut, &c, 1); |
| 2180 | nWS = 1; |
| 2181 | nNL = c=='\n'; |
| 2182 | }else{ |
| 2183 | blob_append(pOut, &c, 1); |
| 2184 | nWS = nNL = 0; |
| 2185 | } |
| 2186 | }else{ |
| 2187 | blob_append(pOut, zIn, n); |
| 2188 | nNL = nWS = 0; |
| 2189 | } |
| 2190 |