Fossil SCM

Striving to make search work better. These changes will require search indexes to be rebuilt, so they go into a branch for now.

drh 2015-02-11 20:16 UTC trunk

Commit e0df485935abab4c48f078d89f0c89d5d5f25334

Parent ef2db66199048db…

1 file changed +41 -3

~ src/wikiformat.c

M src/wikiformat.c

+41 -3

		--- src/wikiformat.c
		+++ src/wikiformat.c
		@@ -1965,17 +1965,26 @@
1965	1965	** z points to the start of a token. Return the number of
1966	1966	** characters in that token.
1967	1967	*/
1968	1968	static int nextHtmlToken(const char *z){
1969	1969	int n;
1970		- if( z[0]=='<' ){
	1970	+ char c;
	1971	+ if( (c=z[0])=='<' ){
1971	1972	n = markupLength(z);
1972	1973	if( n<=0 ) n = 1;
1973		- }else if( fossil_isspace(z[0]) ){
	1974	+ }else if( fossil_isspace(c) ){
1974	1975	for(n=1; z[n] && fossil_isspace(z[n]); n++){}
	1976	+ }else if( c=='&' ){
	1977	+ n = z[1]=='#' ? 2 : 1;
	1978	+ while( fossil_isalnum(z[n]) ) n++;
	1979	+ if( z[n]==';' ) n++;
1975	1980	}else{
1976		- for(n=1; z[n] && z[n]!='<' && !fossil_isspace(z[n]); n++){}
	1981	+ n = 1;
	1982	+ for(n=1; 1; n++){
	1983	+ if( (c = z[n]) > '<' ) continue;
	1984	+ if( c=='<' || c=='&' || fossil_isspace(c) || c==0 ) break;
	1985	+ }
1977	1986	}
1978	1987	return n;
1979	1988	}
1980	1989
1981	1990	/*
		@@ -2106,10 +2115,11 @@
2106	2115	void html_to_plaintext(const char *zIn, Blob *pOut){
2107	2116	int n;
2108	2117	int i, j;
2109	2118	int nNL = 0; /* Number of \n characters at the end of pOut */
2110	2119	int nWS = 0; /* True if pOut ends with whitespace */
	2120	+ while( fossil_isspace(zIn[0]) ) zIn++;
2111	2121	while( zIn[0] ){
2112	2122	n = nextHtmlToken(zIn);
2113	2123	if( zIn[0]=='<' && n>1 ){
2114	2124	int isCloseTag;
2115	2125	int eTag;
		@@ -2142,10 +2152,38 @@
2142	2152	}else if( fossil_isspace(zIn[0]) ){
2143	2153	for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;
2144	2154	if( !nWS ){
2145	2155	blob_append(pOut, nNL ? "\n" : " ", 1);
2146	2156	nWS = 1;
	2157	+ }
	2158	+ }else if( zIn[0]=='&' ){
	2159	+ char c = '?';
	2160	+ if( zIn[1]=='#' ){
	2161	+ int x = atoi(&zIn[1]);
	2162	+ if( x>0 && x<=127 ) c = x;
	2163	+ }else{
	2164	+ static const struct { int n; char c; char *z; } aEntity[] = {
	2165	+ { 5, '&', "&amp;" },
	2166	+ { 4, '<', "&lt;" },
	2167	+ { 4, '>', "&gt;" },
	2168	+ { 6, ' ', "&nbsp;" },
	2169	+ };
	2170	+ int jj;
	2171	+ for(jj=0; jj<ArraySize(aEntity); jj++){
	2172	+ if( aEntity[jj].n==n && strncmp(aEntity[jj].z,zIn,n)==0 ){
	2173	+ c = aEntity[jj].c;
	2174	+ break;
	2175	+ }
	2176	+ }
	2177	+ }
	2178	+ if( fossil_isspace(c) ){
	2179	+ if( nWS==0 ) blob_append(pOut, &c, 1);
	2180	+ nWS = 1;
	2181	+ nNL = c=='\n';
	2182	+ }else{
	2183	+ blob_append(pOut, &c, 1);
	2184	+ nWS = nNL = 0;
2147	2185	}
2148	2186	}else{
2149	2187	blob_append(pOut, zIn, n);
2150	2188	nNL = nWS = 0;
2151	2189	}
2152	2190

	--- src/wikiformat.c
	+++ src/wikiformat.c
	@@ -1965,17 +1965,26 @@
1965	** z points to the start of a token. Return the number of
1966	** characters in that token.
1967	*/
1968	static int nextHtmlToken(const char *z){
1969	int n;
1970	if( z[0]=='<' ){

1971	n = markupLength(z);
1972	if( n<=0 ) n = 1;
1973	}else if( fossil_isspace(z[0]) ){
1974	for(n=1; z[n] && fossil_isspace(z[n]); n++){}




1975	}else{
1976	for(n=1; z[n] && z[n]!='<' && !fossil_isspace(z[n]); n++){}




1977	}
1978	return n;
1979	}
1980
1981	/*
	@@ -2106,10 +2115,11 @@
2106	void html_to_plaintext(const char *zIn, Blob *pOut){
2107	int n;
2108	int i, j;
2109	int nNL = 0; /* Number of \n characters at the end of pOut */
2110	int nWS = 0; /* True if pOut ends with whitespace */

2111	while( zIn[0] ){
2112	n = nextHtmlToken(zIn);
2113	if( zIn[0]=='<' && n>1 ){
2114	int isCloseTag;
2115	int eTag;
	@@ -2142,10 +2152,38 @@
2142	}else if( fossil_isspace(zIn[0]) ){
2143	for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;
2144	if( !nWS ){
2145	blob_append(pOut, nNL ? "\n" : " ", 1);
2146	nWS = 1;




























2147	}
2148	}else{
2149	blob_append(pOut, zIn, n);
2150	nNL = nWS = 0;
2151	}
2152

	--- src/wikiformat.c
	+++ src/wikiformat.c
	@@ -1965,17 +1965,26 @@
1965	** z points to the start of a token. Return the number of
1966	** characters in that token.
1967	*/
1968	static int nextHtmlToken(const char *z){
1969	int n;
1970	char c;
1971	if( (c=z[0])=='<' ){
1972	n = markupLength(z);
1973	if( n<=0 ) n = 1;
1974	}else if( fossil_isspace(c) ){
1975	for(n=1; z[n] && fossil_isspace(z[n]); n++){}
1976	}else if( c=='&' ){
1977	n = z[1]=='#' ? 2 : 1;
1978	while( fossil_isalnum(z[n]) ) n++;
1979	if( z[n]==';' ) n++;
1980	}else{
1981	n = 1;
1982	for(n=1; 1; n++){
1983	if( (c = z[n]) > '<' ) continue;
1984	if( c=='<' || c=='&' || fossil_isspace(c) || c==0 ) break;
1985	}
1986	}
1987	return n;
1988	}
1989
1990	/*
	@@ -2106,10 +2115,11 @@
2115	void html_to_plaintext(const char *zIn, Blob *pOut){
2116	int n;
2117	int i, j;
2118	int nNL = 0; /* Number of \n characters at the end of pOut */
2119	int nWS = 0; /* True if pOut ends with whitespace */
2120	while( fossil_isspace(zIn[0]) ) zIn++;
2121	while( zIn[0] ){
2122	n = nextHtmlToken(zIn);
2123	if( zIn[0]=='<' && n>1 ){
2124	int isCloseTag;
2125	int eTag;
	@@ -2142,10 +2152,38 @@
2152	}else if( fossil_isspace(zIn[0]) ){
2153	for(i=nNL=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;
2154	if( !nWS ){
2155	blob_append(pOut, nNL ? "\n" : " ", 1);
2156	nWS = 1;
2157	}
2158	}else if( zIn[0]=='&' ){
2159	char c = '?';
2160	if( zIn[1]=='#' ){
2161	int x = atoi(&zIn[1]);
2162	if( x>0 && x<=127 ) c = x;
2163	}else{
2164	static const struct { int n; char c; char *z; } aEntity[] = {
2165	{ 5, '&', "&amp;" },
2166	{ 4, '<', "&lt;" },
2167	{ 4, '>', "&gt;" },
2168	{ 6, ' ', "&nbsp;" },
2169	};
2170	int jj;
2171	for(jj=0; jj<ArraySize(aEntity); jj++){
2172	if( aEntity[jj].n==n && strncmp(aEntity[jj].z,zIn,n)==0 ){
2173	c = aEntity[jj].c;
2174	break;
2175	}
2176	}
2177	}
2178	if( fossil_isspace(c) ){
2179	if( nWS==0 ) blob_append(pOut, &c, 1);
2180	nWS = 1;
2181	nNL = c=='\n';
2182	}else{
2183	blob_append(pOut, &c, 1);
2184	nWS = nNL = 0;
2185	}
2186	}else{
2187	blob_append(pOut, zIn, n);
2188	nNL = nWS = 0;
2189	}
2190

Fossil SCM

Keyboard Shortcuts