Fossil SCM

Striving to make search work better. These changes will require search indexes to be rebuilt, so they go into a branch for now.

drh 2015-02-11 20:16 UTC trunk
Commit e0df485935abab4c48f078d89f0c89d5d5f25334
1 file changed +41 -3
+41 -3
--- src/wikiformat.c
+++ src/wikiformat.c
@@ -1965,17 +1965,26 @@
1965 1965   ** z points to the start of a token. Return the number of
1966 1966   ** characters in that token.
1967 1967   */
1968 1968   static int nextHtmlToken(const char *z){
1969 1969   int n;
1970      - if( z[0]=='<' ){
     1970 + char c;
     1971 + if( (c=z[0])=='<' ){
1971 1972   n = markupLength(z);
1972 1973   if( n<=0 ) n = 1;
1973      - }else if( fossil_isspace(z[0]) ){
     1974 + }else if( fossil_isspace(c) ){
1974 1975   for(n=1; z[n] && fossil_isspace(z[n]); n++){}
     1976 + }else if( c=='&' ){
     1977 + n = z[1]=='#' ? 2 : 1;
     1978 + while( fossil_isalnum(z[n]) ) n++;
     1979 + if( z[n]==';' ) n++;
1975 1980   }else{
1976      - for(n=1; z[n] && z[n]!='<' && !fossil_isspace(z[n]); n++){}
     1981 + n = 1;
     1982 + for(n=1; 1; n++){
     1983 + if( (c = z[n]) > '<' ) continue;
     1984 + if( c=='<' || c=='&' || fossil_isspace(c) || c==0 ) break;
     1985 + }
1977 1986   }
1978 1987   return n;
1979 1988   }
1980 1989
1981 1990   /*
@@ -2106,10 +2115,11 @@
2106 2115   void html_to_plaintext(const char *zIn, Blob *pOut){
2107 2116   int n;
2108 2117   int i, j;
2109 2118   int nNL = 0; /* Number of \n characters at the end of pOut */
2110 2119   int nWS = 0; /* True if pOut ends with whitespace */
     2120 + while( fossil_isspace(zIn[0]) ) zIn++;
2111 2121   while( zIn[0] ){
2112 2122   n = nextHtmlToken(zIn);
2113 2123   if( zIn[0]=='<' && n>1 ){
2114 2124   int isCloseTag;
2115 2125   int eTag;
@@ -2142,10 +2152,38 @@
2142 2152   }else if( fossil_isspace(zIn[0]) ){
2143 2153   for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;
2144 2154   if( !nWS ){
2145 2155   blob_append(pOut, nNL ? "\n" : " ", 1);
2146 2156   nWS = 1;
     2157 + }
     2158 + }else if( zIn[0]=='&' ){
     2159 + char c = '?';
     2160 + if( zIn[1]=='#' ){
     2161 + int x = atoi(&zIn[1]);
     2162 + if( x>0 && x<=127 ) c = x;
     2163 + }else{
     2164 + static const struct { int n; char c; char *z; } aEntity[] = {
     2165 + { 5, '&', "&amp;" },
     2166 + { 4, '<', "&lt;" },
     2167 + { 4, '>', "&gt;" },
     2168 + { 6, ' ', "&nbsp;" },
     2169 + };
     2170 + int jj;
     2171 + for(jj=0; jj<ArraySize(aEntity); jj++){
     2172 + if( aEntity[jj].n==n && strncmp(aEntity[jj].z,zIn,n)==0 ){
     2173 + c = aEntity[jj].c;
     2174 + break;
     2175 + }
     2176 + }
     2177 + }
     2178 + if( fossil_isspace(c) ){
     2179 + if( nWS==0 ) blob_append(pOut, &c, 1);
     2180 + nWS = 1;
     2181 + nNL = c=='\n';
     2182 + }else{
     2183 + blob_append(pOut, &c, 1);
     2184 + nWS = nNL = 0;
2147 2185   }
2148 2186   }else{
2149 2187   blob_append(pOut, zIn, n);
2150 2188   nNL = nWS = 0;
2151 2189   }
2152 2190
--- src/wikiformat.c
+++ src/wikiformat.c
@@ -1965,17 +1965,26 @@
1965 ** z points to the start of a token. Return the number of
1966 ** characters in that token.
1967 */
1968 static int nextHtmlToken(const char *z){
1969 int n;
1970 if( z[0]=='<' ){
 
1971 n = markupLength(z);
1972 if( n<=0 ) n = 1;
1973 }else if( fossil_isspace(z[0]) ){
1974 for(n=1; z[n] && fossil_isspace(z[n]); n++){}
 
 
 
 
1975 }else{
1976 for(n=1; z[n] && z[n]!='<' && !fossil_isspace(z[n]); n++){}
 
 
 
 
1977 }
1978 return n;
1979 }
1980
1981 /*
@@ -2106,10 +2115,11 @@
2106 void html_to_plaintext(const char *zIn, Blob *pOut){
2107 int n;
2108 int i, j;
2109 int nNL = 0; /* Number of \n characters at the end of pOut */
2110 int nWS = 0; /* True if pOut ends with whitespace */
 
2111 while( zIn[0] ){
2112 n = nextHtmlToken(zIn);
2113 if( zIn[0]=='<' && n>1 ){
2114 int isCloseTag;
2115 int eTag;
@@ -2142,10 +2152,38 @@
2142 }else if( fossil_isspace(zIn[0]) ){
2143 for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;
2144 if( !nWS ){
2145 blob_append(pOut, nNL ? "\n" : " ", 1);
2146 nWS = 1;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2147 }
2148 }else{
2149 blob_append(pOut, zIn, n);
2150 nNL = nWS = 0;
2151 }
2152
--- src/wikiformat.c
+++ src/wikiformat.c
@@ -1965,17 +1965,26 @@
1965 ** z points to the start of a token. Return the number of
1966 ** characters in that token.
1967 */
1968 static int nextHtmlToken(const char *z){
1969 int n;
1970 char c;
1971 if( (c=z[0])=='<' ){
1972 n = markupLength(z);
1973 if( n<=0 ) n = 1;
1974 }else if( fossil_isspace(c) ){
1975 for(n=1; z[n] && fossil_isspace(z[n]); n++){}
1976 }else if( c=='&' ){
1977 n = z[1]=='#' ? 2 : 1;
1978 while( fossil_isalnum(z[n]) ) n++;
1979 if( z[n]==';' ) n++;
1980 }else{
1981 n = 1;
1982 for(n=1; 1; n++){
1983 if( (c = z[n]) > '<' ) continue;
1984 if( c=='<' || c=='&' || fossil_isspace(c) || c==0 ) break;
1985 }
1986 }
1987 return n;
1988 }
1989
1990 /*
@@ -2106,10 +2115,11 @@
2115 void html_to_plaintext(const char *zIn, Blob *pOut){
2116 int n;
2117 int i, j;
2118 int nNL = 0; /* Number of \n characters at the end of pOut */
2119 int nWS = 0; /* True if pOut ends with whitespace */
2120 while( fossil_isspace(zIn[0]) ) zIn++;
2121 while( zIn[0] ){
2122 n = nextHtmlToken(zIn);
2123 if( zIn[0]=='<' && n>1 ){
2124 int isCloseTag;
2125 int eTag;
@@ -2142,10 +2152,38 @@
2152 }else if( fossil_isspace(zIn[0]) ){
2153 for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;
2154 if( !nWS ){
2155 blob_append(pOut, nNL ? "\n" : " ", 1);
2156 nWS = 1;
2157 }
2158 }else if( zIn[0]=='&' ){
2159 char c = '?';
2160 if( zIn[1]=='#' ){
2161 int x = atoi(&zIn[1]);
2162 if( x>0 && x<=127 ) c = x;
2163 }else{
2164 static const struct { int n; char c; char *z; } aEntity[] = {
2165 { 5, '&', "&amp;" },
2166 { 4, '<', "&lt;" },
2167 { 4, '>', "&gt;" },
2168 { 6, ' ', "&nbsp;" },
2169 };
2170 int jj;
2171 for(jj=0; jj<ArraySize(aEntity); jj++){
2172 if( aEntity[jj].n==n && strncmp(aEntity[jj].z,zIn,n)==0 ){
2173 c = aEntity[jj].c;
2174 break;
2175 }
2176 }
2177 }
2178 if( fossil_isspace(c) ){
2179 if( nWS==0 ) blob_append(pOut, &c, 1);
2180 nWS = 1;
2181 nNL = c=='\n';
2182 }else{
2183 blob_append(pOut, &c, 1);
2184 nWS = nNL = 0;
2185 }
2186 }else{
2187 blob_append(pOut, zIn, n);
2188 nNL = nWS = 0;
2189 }
2190

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button