Fossil SCM
More progress on the markdown #hashtag parsing.
Commit
ac5b66bb40a03f8383a6a61029519e64b0642fadc248ec2c642ff78ab026d266
Parent
5618cceb7bf9e8c…
1 file changed
+80
-9
+80
-9
| --- src/markdown.c | ||
| +++ src/markdown.c | ||
| @@ -917,43 +917,114 @@ | ||
| 917 | 917 | rndr->make.tagspan(ob, &work, MKDT_ATREF, rndr->make.opaque); |
| 918 | 918 | return end; |
| 919 | 919 | } |
| 920 | 920 | |
| 921 | 921 | /* char_hashref_tag -- '#' followed by "word" characters to tag |
| 922 | - * post numbers, hashtags, etc. */ | |
| 922 | +** post numbers, hashtags, etc. | |
| 923 | +** | |
| 924 | +** Basic syntax: | |
| 925 | +** | |
| 926 | +** ^[a-zA-Z]X* | |
| 927 | +** | |
| 928 | +** Where X is: | |
| 929 | +** | |
| 930 | +** - Any number of alphanumeric characters. | |
| 931 | +** | |
| 932 | +** - Single underscores. Adjacent underscores are not recognized | |
| 933 | +** as valid hashtags. That decision is somewhat arbitrary | |
| 934 | +** and up for debate. | |
| 935 | +** | |
| 936 | +** Hashtags must end at the end of input or be followed by whitespace | |
| 937 | +** or what appears to be the end or separator of a logical | |
| 938 | +** natural-language construct, e.g. period, colon, etc. | |
| 939 | +** | |
| 940 | +** Current limitations of this implementation: | |
| 941 | +** | |
| 942 | +** - ASCII only. Support for non-ASCII characters might be | |
| 943 | +** interesting. | |
| 944 | +** | |
| 945 | +** - Currently requires starting alpha and trailing | |
| 946 | +** alphanumeric or underscores. "Should" be extended to | |
| 947 | +** handle #X[.Y], where X and optional Y are integer | |
| 948 | +** values, for forum post references. | |
| 949 | +*/ | |
| 923 | 950 | static size_t char_hashref_tag( |
| 924 | 951 | struct Blob *ob, |
| 925 | 952 | struct render *rndr, |
| 926 | 953 | char *data, |
| 927 | 954 | size_t offset, |
| 928 | 955 | size_t size |
| 929 | 956 | ){ |
| 930 | 957 | size_t end; |
| 931 | 958 | struct Blob work = BLOB_INITIALIZER; |
| 932 | - | |
| 959 | + int nUscore = 0; | |
| 933 | 960 | if(offset>0 && !fossil_isspace(data[-1])){ |
| 934 | 961 | /* Only ever match if the *previous* character is |
| 935 | - whitespace or we're at the start of the input. */ | |
| 962 | + whitespace or we're at the start of the input. | |
| 963 | + Note that we rely on fossil processing emphasis | |
| 964 | + markup before reaching this function, so *#Hash* | |
| 965 | + will Do The Right Thing. */ | |
| 936 | 966 | return 0; |
| 937 | 967 | } |
| 938 | 968 | assert( '#' == data[0] ); |
| 939 | - if(size < 2 || !fossil_isalnum(data[1])) return 0; | |
| 940 | - /*fprintf(stderr,"HASHREF: %.*s\n", (int)size, data);*/ | |
| 941 | - for (end = 2; (end < size) && fossil_isalnum(data[end]); ++end); | |
| 969 | + if(size < 2 || !fossil_isalpha(data[1])) return 0; | |
| 970 | +#if 0 | |
| 971 | + fprintf(stderr,"HASHREF offset=%d size=%d: %.*s\n", | |
| 972 | + (int)offset, (int)size, (int)size, data); | |
| 973 | +#endif | |
| 974 | +#define HASHTAG_LEGAL_END \ | |
| 975 | + case ' ': case '\t': case '\r': case '\n': case '.': case ':': case ';': case '!': case '?' | |
| 976 | + for(end = 2; end < size; ++end){ | |
| 977 | + char ch = data[end]; | |
| 978 | + switch(ch){ | |
| 979 | + case '_': | |
| 980 | + /* Multiple adjacent underscores not permitted. */ | |
| 981 | + if(++nUscore>1) goto hashref_bailout; | |
| 982 | + break; | |
| 983 | + HASHTAG_LEGAL_END: | |
| 984 | + if(end<3) goto hashref_bailout/*require 2+ characters (arbitrary)*/; | |
| 985 | + ch = 0; | |
| 986 | + break; | |
| 987 | + default: | |
| 988 | + if(!fossil_isalnum(ch)) goto hashref_bailout; | |
| 989 | + nUscore = 0; | |
| 990 | + break; | |
| 991 | + } | |
| 992 | + if(ch) continue; | |
| 993 | + else break; | |
| 994 | + } | |
| 995 | +#if 0 | |
| 996 | + fprintf(stderr,"?HASHREF length=%d: %.*s\n", | |
| 997 | + (int)end, (int)end, data); | |
| 998 | +#endif | |
| 942 | 999 | /*TODO: in order to support detection of forum post-style |
| 943 | 1000 | references, we need to recognize #X.Y, but only when X and Y are |
| 944 | 1001 | both purely numeric and Y ends on a word/sentence |
| 945 | 1002 | boundary.*/ |
| 946 | 1003 | if(end<size){ |
| 947 | - /* Only match if we end at a dot or space or end of input */ | |
| 948 | - if(data[end]!='.' && !fossil_isspace(data[end])){ | |
| 949 | - return 0; | |
| 1004 | + /* Only match if we end at end of input or what "might" be the end | |
| 1005 | + of a natural language grammar construct, e.g. period or | |
| 1006 | + [semi]colon. */ | |
| 1007 | + switch(data[end]){ | |
| 1008 | + HASHTAG_LEGAL_END: | |
| 1009 | + /* We could arguably treat any leading multi-byte character as | |
| 1010 | + valid here. */ | |
| 1011 | + break; | |
| 1012 | + default: | |
| 1013 | + goto hashref_bailout; | |
| 950 | 1014 | } |
| 951 | 1015 | } |
| 952 | 1016 | blob_init(&work, data + 1, end - 1); |
| 953 | 1017 | rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque); |
| 954 | 1018 | return end; |
| 1019 | + hashref_bailout: | |
| 1020 | +#if 0 | |
| 1021 | + fprintf(stderr,"BAILING HASHREF examined=%d:\n[%.*s] of\n[%.*s]\n", | |
| 1022 | + (int)end, (int)end, data, (int)size, data); | |
| 1023 | +#endif | |
| 1024 | +#undef HASHTAG_LEGAL_END | |
| 1025 | + return 0; | |
| 955 | 1026 | } |
| 956 | 1027 | |
| 957 | 1028 | |
| 958 | 1029 | /* char_langle_tag -- '<' when tags or autolinks are allowed */ |
| 959 | 1030 | static size_t char_langle_tag( |
| 960 | 1031 |
| --- src/markdown.c | |
| +++ src/markdown.c | |
| @@ -917,43 +917,114 @@ | |
| 917 | rndr->make.tagspan(ob, &work, MKDT_ATREF, rndr->make.opaque); |
| 918 | return end; |
| 919 | } |
| 920 | |
| 921 | /* char_hashref_tag -- '#' followed by "word" characters to tag |
| 922 | * post numbers, hashtags, etc. */ |
| 923 | static size_t char_hashref_tag( |
| 924 | struct Blob *ob, |
| 925 | struct render *rndr, |
| 926 | char *data, |
| 927 | size_t offset, |
| 928 | size_t size |
| 929 | ){ |
| 930 | size_t end; |
| 931 | struct Blob work = BLOB_INITIALIZER; |
| 932 | |
| 933 | if(offset>0 && !fossil_isspace(data[-1])){ |
| 934 | /* Only ever match if the *previous* character is |
| 935 | whitespace or we're at the start of the input. */ |
| 936 | return 0; |
| 937 | } |
| 938 | assert( '#' == data[0] ); |
| 939 | if(size < 2 || !fossil_isalnum(data[1])) return 0; |
| 940 | /*fprintf(stderr,"HASHREF: %.*s\n", (int)size, data);*/ |
| 941 | for (end = 2; (end < size) && fossil_isalnum(data[end]); ++end); |
| 942 | /*TODO: in order to support detection of forum post-style |
| 943 | references, we need to recognize #X.Y, but only when X and Y are |
| 944 | both purely numeric and Y ends on a word/sentence |
| 945 | boundary.*/ |
| 946 | if(end<size){ |
| 947 | /* Only match if we end at a dot or space or end of input */ |
| 948 | if(data[end]!='.' && !fossil_isspace(data[end])){ |
| 949 | return 0; |
| 950 | } |
| 951 | } |
| 952 | blob_init(&work, data + 1, end - 1); |
| 953 | rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque); |
| 954 | return end; |
| 955 | } |
| 956 | |
| 957 | |
| 958 | /* char_langle_tag -- '<' when tags or autolinks are allowed */ |
| 959 | static size_t char_langle_tag( |
| 960 |
| --- src/markdown.c | |
| +++ src/markdown.c | |
| @@ -917,43 +917,114 @@ | |
| 917 | rndr->make.tagspan(ob, &work, MKDT_ATREF, rndr->make.opaque); |
| 918 | return end; |
| 919 | } |
| 920 | |
| 921 | /* char_hashref_tag -- '#' followed by "word" characters to tag |
| 922 | ** post numbers, hashtags, etc. |
| 923 | ** |
| 924 | ** Basic syntax: |
| 925 | ** |
| 926 | ** ^[a-zA-Z]X* |
| 927 | ** |
| 928 | ** Where X is: |
| 929 | ** |
| 930 | ** - Any number of alphanumeric characters. |
| 931 | ** |
| 932 | ** - Single underscores. Adjacent underscores are not recognized |
| 933 | ** as valid hashtags. That decision is somewhat arbitrary |
| 934 | ** and up for debate. |
| 935 | ** |
| 936 | ** Hashtags must end at the end of input or be followed by whitespace |
| 937 | ** or what appears to be the end or separator of a logical |
| 938 | ** natural-language construct, e.g. period, colon, etc. |
| 939 | ** |
| 940 | ** Current limitations of this implementation: |
| 941 | ** |
| 942 | ** - ASCII only. Support for non-ASCII characters might be |
| 943 | ** interesting. |
| 944 | ** |
| 945 | ** - Currently requires starting alpha and trailing |
| 946 | ** alphanumeric or underscores. "Should" be extended to |
| 947 | ** handle #X[.Y], where X and optional Y are integer |
| 948 | ** values, for forum post references. |
| 949 | */ |
| 950 | static size_t char_hashref_tag( |
| 951 | struct Blob *ob, |
| 952 | struct render *rndr, |
| 953 | char *data, |
| 954 | size_t offset, |
| 955 | size_t size |
| 956 | ){ |
| 957 | size_t end; |
| 958 | struct Blob work = BLOB_INITIALIZER; |
| 959 | int nUscore = 0; |
| 960 | if(offset>0 && !fossil_isspace(data[-1])){ |
| 961 | /* Only ever match if the *previous* character is |
| 962 | whitespace or we're at the start of the input. |
| 963 | Note that we rely on fossil processing emphasis |
| 964 | markup before reaching this function, so *#Hash* |
| 965 | will Do The Right Thing. */ |
| 966 | return 0; |
| 967 | } |
| 968 | assert( '#' == data[0] ); |
| 969 | if(size < 2 || !fossil_isalpha(data[1])) return 0; |
| 970 | #if 0 |
| 971 | fprintf(stderr,"HASHREF offset=%d size=%d: %.*s\n", |
| 972 | (int)offset, (int)size, (int)size, data); |
| 973 | #endif |
| 974 | #define HASHTAG_LEGAL_END \ |
| 975 | case ' ': case '\t': case '\r': case '\n': case '.': case ':': case ';': case '!': case '?' |
| 976 | for(end = 2; end < size; ++end){ |
| 977 | char ch = data[end]; |
| 978 | switch(ch){ |
| 979 | case '_': |
| 980 | /* Multiple adjacent underscores not permitted. */ |
| 981 | if(++nUscore>1) goto hashref_bailout; |
| 982 | break; |
| 983 | HASHTAG_LEGAL_END: |
| 984 | if(end<3) goto hashref_bailout/*require 2+ characters (arbitrary)*/; |
| 985 | ch = 0; |
| 986 | break; |
| 987 | default: |
| 988 | if(!fossil_isalnum(ch)) goto hashref_bailout; |
| 989 | nUscore = 0; |
| 990 | break; |
| 991 | } |
| 992 | if(ch) continue; |
| 993 | else break; |
| 994 | } |
| 995 | #if 0 |
| 996 | fprintf(stderr,"?HASHREF length=%d: %.*s\n", |
| 997 | (int)end, (int)end, data); |
| 998 | #endif |
| 999 | /*TODO: in order to support detection of forum post-style |
| 1000 | references, we need to recognize #X.Y, but only when X and Y are |
| 1001 | both purely numeric and Y ends on a word/sentence |
| 1002 | boundary.*/ |
| 1003 | if(end<size){ |
| 1004 | /* Only match if we end at end of input or what "might" be the end |
| 1005 | of a natural language grammar construct, e.g. period or |
| 1006 | [semi]colon. */ |
| 1007 | switch(data[end]){ |
| 1008 | HASHTAG_LEGAL_END: |
| 1009 | /* We could arguably treat any leading multi-byte character as |
| 1010 | valid here. */ |
| 1011 | break; |
| 1012 | default: |
| 1013 | goto hashref_bailout; |
| 1014 | } |
| 1015 | } |
| 1016 | blob_init(&work, data + 1, end - 1); |
| 1017 | rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque); |
| 1018 | return end; |
| 1019 | hashref_bailout: |
| 1020 | #if 0 |
| 1021 | fprintf(stderr,"BAILING HASHREF examined=%d:\n[%.*s] of\n[%.*s]\n", |
| 1022 | (int)end, (int)end, data, (int)size, data); |
| 1023 | #endif |
| 1024 | #undef HASHTAG_LEGAL_END |
| 1025 | return 0; |
| 1026 | } |
| 1027 | |
| 1028 | |
| 1029 | /* char_langle_tag -- '<' when tags or autolinks are allowed */ |
| 1030 | static size_t char_langle_tag( |
| 1031 |