| | @@ -2014,16 +2014,13 @@ |
| 2014 | 2014 | } |
| 2015 | 2015 | free(renderer.aStack); |
| 2016 | 2016 | } |
| 2017 | 2017 | |
| 2018 | 2018 | /* |
| 2019 | | -** Get the next HTML token. |
| 2020 | | -** |
| 2021 | | -** z points to the start of a token. Return the number of |
| 2022 | | -** characters in that token. |
| 2019 | +** Return the length, in bytes, of the HTML token that z is pointing to. |
| 2023 | 2020 | */ |
| 2024 | | -static int nextHtmlToken(const char *z){ |
| 2021 | +int html_token_length(const char *z){ |
| 2025 | 2022 | int n; |
| 2026 | 2023 | char c; |
| 2027 | 2024 | if( (c=z[0])=='<' ){ |
| 2028 | 2025 | n = htmlTagLength(z); |
| 2029 | 2026 | if( n<=0 ) n = 1; |
| | @@ -2040,10 +2037,112 @@ |
| 2040 | 2037 | if( c=='<' || c=='&' || fossil_isspace(c) || c==0 ) break; |
| 2041 | 2038 | } |
| 2042 | 2039 | } |
| 2043 | 2040 | return n; |
| 2044 | 2041 | } |
| 2042 | + |
| 2043 | +/* |
| 2044 | +** z points to someplace in the middle of HTML markup. Return the length |
| 2045 | +** in bytes of the subtoken that starts at z. A subtoken is one of: |
| 2046 | +** |
| 2047 | +**   *  a run of whitespace |
| 2048 | +**   *  a quoted string with its quotes; the closing quote is absent if |
| 2049 | +**      the string runs into '>' or the end of the input first |
| 2050 | +**   *  a name: alphanumerics, '-' and '_', possibly with a leading '/' |
| 2051 | +**   *  any other single byte, for example '=' |
| 2052 | +** |
| 2053 | +** Return 0 at the '>' that ends the markup and at the end of the input, |
| 2054 | +** so that a caller looping until 0 never steps past the terminator. |
| 2055 | +*/ |
| 2056 | +int html_subtoken_length(const char *z){ |
| 2057 | +  int n = 1; |
| 2058 | +  char c = z[0]; |
| 2059 | +  if( c=='>' || c==0 ) return 0; |
| 2060 | +  if( fossil_isspace(c) ){ |
| 2061 | +    while( z[n] && fossil_isspace(z[n]) ) n++; |
| 2062 | +  }else if( c=='"' || c=='\'' ){ |
| 2063 | +    while( z[n] && z[n]!=c && z[n]!='>' ) n++; |
| 2064 | +    if( z[n]==c ) n++;   /* Include the closing quote, if there is one */ |
| 2065 | +  }else if( fossil_isalnum(c) || c=='/' ){ |
| 2066 | +    /* Only the first byte of a name may be '/' */ |
| 2067 | +    while( (c = z[n])!=0 && (fossil_isalnum(c) || c=='-' || c=='_') ) n++; |
| 2068 | +  } |
| 2069 | +  /* Anything else, '=' included, is a subtoken of one byte */ |
| 2070 | +  return n; |
| 2071 | +} |
| 2072 | + |
| 2073 | +/* |
| 2074 | +** zMarkup points to an HTML markup token: <TAG ATTR=VALUE ...> |
| 2075 | +** Look for the VALUE that goes with attribute zAttr (names are compared |
| 2076 | +** using fossil_strnicmp()). If found, return a pointer to the first byte |
| 2077 | +** of the value and set *pLen to its length in bytes. The result points |
| 2078 | +** into zMarkup, is not zero-terminated, and excludes a pair of enclosing |
| 2079 | +** quotes. Return NULL, leaving *pLen unset, if there is no zAttr=VALUE. |
| 2080 | +*/ |
| 2081 | +const char *html_attribute(const char *zMarkup, const char *zAttr, int *pLen){ |
| 2082 | +  int i = 1;                       /* Offset of next subtoken. 1 skips '<' */ |
| 2083 | +  int n;                           /* Length of the current subtoken */ |
| 2084 | +  int nAttr = (int)strlen(zAttr);  /* Length of the name being sought */ |
| 2085 | +  int eState = 0;     /* 0: looking for zAttr. 1: seen zAttr. 2: seen zAttr= */ |
| 2086 | +  assert( zMarkup[0]=='<' ); |
| 2087 | +  assert( zMarkup[1]!=0 ); |
| 2088 | +  n = html_subtoken_length(zMarkup+i);   /* Step over TAG */ |
| 2089 | +  if( n==0 ) return 0; |
| 2090 | +  i += n; |
| 2091 | +  while( zMarkup[i] && (n = html_subtoken_length(zMarkup+i))>0 ){ |
| 2092 | +    const char *zStart = zMarkup+i; |
| 2093 | +    i += n; |
| 2094 | +    if( fossil_isspace(zStart[0]) ) continue; |
| 2095 | +    if( eState==2 ){ |
| 2096 | +      /* This is the value. Test the state before the name so that a value |
| 2097 | +      ** spelled like the attribute, as in <input name=name>, is not taken for |
| 2098 | +      ** a name. A lone quote (n==1) must not be stripped to a length of -1. */ |
| 2099 | +      if( n>=2 && (zStart[0]=='"' || zStart[0]=='\'') && zStart[n-1]==zStart[0] ){ |
| 2100 | +        zStart++;  n -= 2; |
| 2101 | +      } |
| 2102 | +      *pLen = n; |
| 2103 | +      return zStart; |
| 2104 | +    } |
| 2105 | +    if( n==nAttr && fossil_strnicmp(zAttr,zStart,nAttr)==0 ){ |
| 2106 | +      eState = 1; |
| 2107 | +    }else{ |
| 2108 | +      eState = (eState==1 && n==1 && zStart[0]=='=') ? 2 : 0; |
| 2109 | +    } |
| 2110 | +  } |
| 2111 | +  return 0; |
| 2112 | +} |
| 2113 | + |
| 2114 | +/* |
| 2115 | +** COMMAND: test-html-tokenize |
| 2116 | +** |
| 2117 | +** Tokenize an HTML file. Return the offset and length and text of |
| 2118 | +** each token - one token per line. Omit white-space tokens. |
| 2119 | +*/ |
| 2120 | +void test_html_tokenize(void){ |
| 2121 | +  Blob in;                 /* Content of the file being tokenized */ |
| 2122 | +  const char *zText;       /* Text of the file, from blob_str() */ |
| 2123 | +  const char *zTok;        /* Start of the current token */ |
| 2124 | +  const char *zSub;        /* Start of the current subtoken */ |
| 2125 | +  int iArg, nTok, nSub; |
| 2126 | +  for(iArg=2; iArg<g.argc; iArg++){ |
| 2127 | +    blob_read_from_file(&in, g.argv[iArg], ExtFILE); |
| 2128 | +    zText = blob_str(&in); |
| 2129 | +    for(zTok=zText; zTok[0]; zTok+=nTok){ |
| 2130 | +      nTok = html_token_length(zTok); |
| 2131 | +      if( fossil_isspace(zTok[0]) ) continue; |
| 2132 | +      fossil_print("%d %d %.*s\n", (int)(zTok-zText), nTok, nTok, zTok); |
| 2133 | +      if( zTok[0]!='<' || nTok<=1 ) continue; |
| 2134 | +      /* Markup: also list its subtokens, other than whitespace and '=', |
| 2135 | +      ** each on a line that begins with '#' */ |
| 2136 | +      for(zSub=zTok+1; (nSub = html_subtoken_length(zSub))>0; zSub+=nSub){ |
| 2137 | +        if( fossil_isspace(zSub[0]) || zSub[0]=='=' ) continue; |
| 2138 | +        fossil_print("# %d %d %.*s\n", (int)(zSub-zText), nSub, nSub, zSub); |
| 2139 | +      } |
| 2140 | +    } |
| 2141 | +    blob_reset(&in); |
| 2142 | +  } |
| 2143 | +} |
| 2045 | 2144 | |
| 2046 | 2145 | /* |
| 2047 | 2146 | ** Attempt to reformat messy HTML to be easily readable by humans. |
| 2048 | 2147 | ** |
| 2049 | 2148 | ** * Try to keep lines less than 80 characters in length |
| | @@ -2062,11 +2161,11 @@ |
| 2062 | 2161 | int nPre = 0; |
| 2063 | 2162 | int iCur = 0; |
| 2064 | 2163 | int wantSpace = 0; |
| 2065 | 2164 | int omitSpace = 1; |
| 2066 | 2165 | while( zIn[0] ){ |
| 2067 | | - n = nextHtmlToken(zIn); |
| 2166 | + n = html_token_length(zIn); |
| 2068 | 2167 | if( zIn[0]=='<' && n>1 ){ |
| 2069 | 2168 | int i, j; |
| 2070 | 2169 | int isCloseTag; |
| 2071 | 2170 | int eTag; |
| 2072 | 2171 | int eType; |
| | @@ -2181,11 +2280,11 @@ |
| 2181 | 2280 | int seenText = 0; /* True after first non-whitespace seen */ |
| 2182 | 2281 | int nNL = 0; /* Number of \n characters at the end of pOut */ |
| 2183 | 2282 | int nWS = 0; /* True if pOut ends with whitespace */ |
| 2184 | 2283 | while( fossil_isspace(zIn[0]) ) zIn++; |
| 2185 | 2284 | while( zIn[0] ){ |
| 2186 | | - n = nextHtmlToken(zIn); |
| 2285 | + n = html_token_length(zIn); |
| 2187 | 2286 | if( zIn[0]=='<' && n>1 ){ |
| 2188 | 2287 | int isCloseTag; |
| 2189 | 2288 | int eTag; |
| 2190 | 2289 | int eType; |
| 2191 | 2290 | char zTag[32]; |
| | @@ -2197,11 +2296,11 @@ |
| 2197 | 2296 | eTag = findTag(zTag); |
| 2198 | 2297 | eType = aMarkup[eTag].iType; |
| 2199 | 2298 | if( eTag==MARKUP_INVALID && fossil_strnicmp(zIn,"<style",6)==0 ){ |
| 2200 | 2299 | zIn += n; |
| 2201 | 2300 | while( zIn[0] ){ |
| 2202 | | - n = nextHtmlToken(zIn); |
| 2301 | + n = html_token_length(zIn); |
| 2203 | 2302 | if( fossil_strnicmp(zIn, "</style",7)==0 ) break; |
| 2204 | 2303 | zIn += n; |
| 2205 | 2304 | } |
| 2206 | 2305 | if( zIn[0]=='<' ) zIn += n; |
| 2207 | 2306 | continue; |
| 2208 | 2307 | |