Fossil SCM
Any non-ASCII characters are now considered valid for use in hashtags.
Commit
4f68a1306f7236f58ccd8ee24e0eb5a609eb7f0f30edf60d73320c6a848b6c41
Parent
7cae4c09813bacc…
2 files changed
+37
-15
+3
-10
+37
-15
| --- src/markdown.c | ||
| +++ src/markdown.c | ||
| @@ -43,11 +43,12 @@ | ||
| 43 | 43 | }; |
| 44 | 44 | |
| 45 | 45 | /* mkd_tagspan -- type of tagged <span> */ |
| 46 | 46 | enum mkd_tagspan { |
| 47 | 47 | MKDT_ATREF, /* @name references, as in /chat attention targeting */ |
| 48 | - MKDT_HASHTAG, /* #hash tags, message IDs, etc. */ | |
| 48 | + MKDT_HASHTAG, /* #hashtags */ | |
| 49 | + MKDT_NUMTAG /* #123[.456] /chat or /forum message IDs. */ | |
| 49 | 50 | }; |
| 50 | 51 | |
| 51 | 52 | /* mkd_renderer -- functions for rendering parsed data */ |
| 52 | 53 | struct mkd_renderer { |
| 53 | 54 | /* document level callbacks */ |
| @@ -954,11 +955,11 @@ | ||
| 954 | 955 | size_t offset, |
| 955 | 956 | size_t size |
| 956 | 957 | ){ |
| 957 | 958 | size_t end; |
| 958 | 959 | struct Blob work = BLOB_INITIALIZER; |
| 959 | - int nUscore = 0; /* Consecutive underscore counter */; | |
| 960 | + int nUscore = 0; /* Consecutive underscore counter */ | |
| 960 | 961 | int numberMode = 0 /* 0 for normal, 1 for #NNN numeric, |
| 961 | 962 | and 2 for #NNN.NNN. */; |
| 962 | 963 | if(offset>0 && !fossil_isspace(data[-1])){ |
| 963 | 964 | /* Only ever match if the *previous* character is whitespace or |
| 964 | 965 | we're at the start of the input. Note that we rely on fossil |
| @@ -967,56 +968,75 @@ | ||
| 967 | 968 | "#Hash." will match while ".#Hash" won't. That's okay. */ |
| 968 | 969 | return 0; |
| 969 | 970 | } |
| 970 | 971 | assert( '#' == data[0] ); |
| 971 | 972 | if(size < 2) return 0; |
| 973 | + end = 2; | |
| 972 | 974 | if(fossil_isdigit(data[1])){ |
| 973 | 975 | numberMode = 1; |
| 974 | 976 | }else if(!fossil_isalpha(data[1])){ |
| 975 | - return 0; | |
| 977 | + switch(data[1] & 0xF0){ | |
| 978 | + /* Reminder: UTF8 char lengths can be determined by | |
| 979 | + masking against 0xF0: 0xf0==4, 0xe0==3, 0xc0==2, | |
| 980 | + else 1. */ | |
| 981 | + case 0xF0: end+=3; break; | |
| 982 | + case 0xE0: end+=2; break; | |
| 983 | + case 0xC0: end+=1; break; | |
| 984 | + default: return 0; | |
| 985 | + } | |
| 976 | 986 | } |
| 977 | 987 | #if 0 |
| 978 | 988 | fprintf(stderr,"HASHREF offset=%d size=%d: %.*s\n", |
| 979 | 989 | (int)offset, (int)size, (int)size, data); |
| 980 | 990 | #endif |
| 981 | 991 | #define HASHTAG_LEGAL_END \ |
| 982 | 992 | case ' ': case '\t': case '\r': case '\n': \ |
| 983 | 993 | case ':': case ';': case '!': case '?': case ',' |
| 984 | 994 | /* ^^^^ '.' is handled separately */ |
| 985 | - for(end = 2; end < size; ++end){ | |
| 995 | + for(; end < size; ++end){ | |
| 986 | 996 | char ch = data[end]; |
| 987 | - /* Potential TODO: if (ch & 0xF0), treat it as valid, skip that | |
| 988 | - multi-byte character's length characters, and continue | |
| 989 | - looping. Reminder: UTF8 char lengths can be determined by | |
| 990 | - masking against 0xF0: 0xf0==4, 0xe0==3, 0xc0==2, else 1. */ | |
| 997 | + switch(ch & 0xF0){ | |
| 998 | + case 0xF0: end+=3; continue; | |
| 999 | + case 0xE0: end+=2; continue; | |
| 1000 | + case 0xC0: end+=1; continue; | |
| 1001 | + case 0x80: goto hashref_bailout /*invalid UTF8*/; | |
| 1002 | + default: break; | |
| 1003 | + } | |
| 1004 | +#if 0 | |
| 1005 | + fprintf(stderr,"hashtag? checking... length=%d: %.*s\n", | |
| 1006 | + (int)end, (int)end, data); | |
| 1007 | +#endif | |
| 991 | 1008 | switch(ch){ |
| 992 | 1009 | case '_': |
| 993 | 1010 | /* Multiple adjacent underscores not permitted. */ |
| 994 | - if(numberMode>0 || ++nUscore>1) goto hashref_bailout; | |
| 1011 | + if(++nUscore>1) goto hashref_bailout; | |
| 1012 | + numberMode = 0; | |
| 995 | 1013 | break; |
| 996 | 1014 | case '.': |
| 997 | 1015 | if(1==numberMode) ++numberMode; |
| 998 | 1016 | ch = 0; |
| 999 | 1017 | break; |
| 1000 | 1018 | HASHTAG_LEGAL_END: |
| 1001 | - if(numberMode==0 && end<3){ | |
| 1002 | - goto hashref_bailout/*require 2+ characters (arbitrary)*/; | |
| 1003 | - } | |
| 1004 | 1019 | ch = 0; |
| 1005 | 1020 | break; |
| 1006 | 1021 | case '0': case '1': case '2': case '3': case '4': |
| 1007 | 1022 | case '5': case '6': case '7': case '8': case '9': |
| 1023 | + nUscore = 0; | |
| 1008 | 1024 | break; |
| 1009 | 1025 | default: |
| 1010 | - if(numberMode!=0 || !fossil_isalpha(ch)){ | |
| 1026 | + if(numberMode || !fossil_isalpha(ch)){ | |
| 1011 | 1027 | goto hashref_bailout; |
| 1012 | 1028 | } |
| 1013 | 1029 | nUscore = 0; |
| 1014 | 1030 | break; |
| 1015 | 1031 | } |
| 1016 | 1032 | if(ch) continue; |
| 1017 | 1033 | break; |
| 1034 | + } | |
| 1035 | + if((end<3/* #. or some such */ && !numberMode) | |
| 1036 | + || end>size/*from truncated multi-byte char*/){ | |
| 1037 | + return 0; | |
| 1018 | 1038 | } |
| 1019 | 1039 | if(numberMode>1){ |
| 1020 | 1040 | /* Check for trailing part of #NNN.nnn... */ |
| 1021 | 1041 | assert('.'==data[end]); |
| 1022 | 1042 | if(end<size-1 && fossil_isdigit(data[end+1])){ |
| @@ -1024,11 +1044,11 @@ | ||
| 1024 | 1044 | if(!fossil_isdigit(data[end])) break; |
| 1025 | 1045 | } |
| 1026 | 1046 | } |
| 1027 | 1047 | } |
| 1028 | 1048 | #if 0 |
| 1029 | - fprintf(stderr,"?HASHREF length=%d: %.*s\n", | |
| 1049 | + fprintf(stderr,"???HASHREF length=%d: %.*s\n", | |
| 1030 | 1050 | (int)end, (int)end, data); |
| 1031 | 1051 | #endif |
| 1032 | 1052 | if(end<size){ |
| 1033 | 1053 | /* Only match if we end at end of input or what "might" be the end |
| 1034 | 1054 | of a natural language grammar construct, e.g. period or |
| @@ -1040,11 +1060,13 @@ | ||
| 1040 | 1060 | default: |
| 1041 | 1061 | goto hashref_bailout; |
| 1042 | 1062 | } |
| 1043 | 1063 | } |
| 1044 | 1064 | blob_init(&work, data + 1, end - 1); |
| 1045 | - rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque); | |
| 1065 | + rndr->make.tagspan(ob, &work, | |
| 1066 | + numberMode ? MKDT_NUMTAG : MKDT_HASHTAG, | |
| 1067 | + rndr->make.opaque); | |
| 1046 | 1068 | return end; |
| 1047 | 1069 | hashref_bailout: |
| 1048 | 1070 | #if 0 |
| 1049 | 1071 | fprintf(stderr,"BAILING HASHREF examined=%d:\n[%.*s] of\n[%.*s]\n", |
| 1050 | 1072 | (int)end, (int)end, data, (int)size, data); |
| 1051 | 1073 |
| --- src/markdown.c | |
| +++ src/markdown.c | |
| @@ -43,11 +43,12 @@ | |
| 43 | }; |
| 44 | |
| 45 | /* mkd_tagspan -- type of tagged <span> */ |
| 46 | enum mkd_tagspan { |
| 47 | MKDT_ATREF, /* @name references, as in /chat attention targeting */ |
| 48 | MKDT_HASHTAG, /* #hash tags, message IDs, etc. */ |
| 49 | }; |
| 50 | |
| 51 | /* mkd_renderer -- functions for rendering parsed data */ |
| 52 | struct mkd_renderer { |
| 53 | /* document level callbacks */ |
| @@ -954,11 +955,11 @@ | |
| 954 | size_t offset, |
| 955 | size_t size |
| 956 | ){ |
| 957 | size_t end; |
| 958 | struct Blob work = BLOB_INITIALIZER; |
| 959 | int nUscore = 0; /* Consecutive underscore counter */; |
| 960 | int numberMode = 0 /* 0 for normal, 1 for #NNN numeric, |
| 961 | and 2 for #NNN.NNN. */; |
| 962 | if(offset>0 && !fossil_isspace(data[-1])){ |
| 963 | /* Only ever match if the *previous* character is whitespace or |
| 964 | we're at the start of the input. Note that we rely on fossil |
| @@ -967,56 +968,75 @@ | |
| 967 | "#Hash." will match while ".#Hash" won't. That's okay. */ |
| 968 | return 0; |
| 969 | } |
| 970 | assert( '#' == data[0] ); |
| 971 | if(size < 2) return 0; |
| 972 | if(fossil_isdigit(data[1])){ |
| 973 | numberMode = 1; |
| 974 | }else if(!fossil_isalpha(data[1])){ |
| 975 | return 0; |
| 976 | } |
| 977 | #if 0 |
| 978 | fprintf(stderr,"HASHREF offset=%d size=%d: %.*s\n", |
| 979 | (int)offset, (int)size, (int)size, data); |
| 980 | #endif |
| 981 | #define HASHTAG_LEGAL_END \ |
| 982 | case ' ': case '\t': case '\r': case '\n': \ |
| 983 | case ':': case ';': case '!': case '?': case ',' |
| 984 | /* ^^^^ '.' is handled separately */ |
| 985 | for(end = 2; end < size; ++end){ |
| 986 | char ch = data[end]; |
| 987 | /* Potential TODO: if (ch & 0xF0), treat it as valid, skip that |
| 988 | multi-byte character's length characters, and continue |
| 989 | looping. Reminder: UTF8 char lengths can be determined by |
| 990 | masking against 0xF0: 0xf0==4, 0xe0==3, 0xc0==2, else 1. */ |
| 991 | switch(ch){ |
| 992 | case '_': |
| 993 | /* Multiple adjacent underscores not permitted. */ |
| 994 | if(numberMode>0 || ++nUscore>1) goto hashref_bailout; |
| 995 | break; |
| 996 | case '.': |
| 997 | if(1==numberMode) ++numberMode; |
| 998 | ch = 0; |
| 999 | break; |
| 1000 | HASHTAG_LEGAL_END: |
| 1001 | if(numberMode==0 && end<3){ |
| 1002 | goto hashref_bailout/*require 2+ characters (arbitrary)*/; |
| 1003 | } |
| 1004 | ch = 0; |
| 1005 | break; |
| 1006 | case '0': case '1': case '2': case '3': case '4': |
| 1007 | case '5': case '6': case '7': case '8': case '9': |
| 1008 | break; |
| 1009 | default: |
| 1010 | if(numberMode!=0 || !fossil_isalpha(ch)){ |
| 1011 | goto hashref_bailout; |
| 1012 | } |
| 1013 | nUscore = 0; |
| 1014 | break; |
| 1015 | } |
| 1016 | if(ch) continue; |
| 1017 | break; |
| 1018 | } |
| 1019 | if(numberMode>1){ |
| 1020 | /* Check for trailing part of #NNN.nnn... */ |
| 1021 | assert('.'==data[end]); |
| 1022 | if(end<size-1 && fossil_isdigit(data[end+1])){ |
| @@ -1024,11 +1044,11 @@ | |
| 1024 | if(!fossil_isdigit(data[end])) break; |
| 1025 | } |
| 1026 | } |
| 1027 | } |
| 1028 | #if 0 |
| 1029 | fprintf(stderr,"?HASHREF length=%d: %.*s\n", |
| 1030 | (int)end, (int)end, data); |
| 1031 | #endif |
| 1032 | if(end<size){ |
| 1033 | /* Only match if we end at end of input or what "might" be the end |
| 1034 | of a natural language grammar construct, e.g. period or |
| @@ -1040,11 +1060,13 @@ | |
| 1040 | default: |
| 1041 | goto hashref_bailout; |
| 1042 | } |
| 1043 | } |
| 1044 | blob_init(&work, data + 1, end - 1); |
| 1045 | rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque); |
| 1046 | return end; |
| 1047 | hashref_bailout: |
| 1048 | #if 0 |
| 1049 | fprintf(stderr,"BAILING HASHREF examined=%d:\n[%.*s] of\n[%.*s]\n", |
| 1050 | (int)end, (int)end, data, (int)size, data); |
| 1051 |
| --- src/markdown.c | |
| +++ src/markdown.c | |
| @@ -43,11 +43,12 @@ | |
| 43 | }; |
| 44 | |
| 45 | /* mkd_tagspan -- type of tagged <span> */ |
| 46 | enum mkd_tagspan { |
| 47 | MKDT_ATREF, /* @name references, as in /chat attention targeting */ |
| 48 | MKDT_HASHTAG, /* #hashtags */ |
| 49 | MKDT_NUMTAG /* #123[.456] /chat or /forum message IDs. */ |
| 50 | }; |
| 51 | |
| 52 | /* mkd_renderer -- functions for rendering parsed data */ |
| 53 | struct mkd_renderer { |
| 54 | /* document level callbacks */ |
| @@ -954,11 +955,11 @@ | |
| 955 | size_t offset, |
| 956 | size_t size |
| 957 | ){ |
| 958 | size_t end; |
| 959 | struct Blob work = BLOB_INITIALIZER; |
| 960 | int nUscore = 0; /* Consecutive underscore counter */ |
| 961 | int numberMode = 0 /* 0 for normal, 1 for #NNN numeric, |
| 962 | and 2 for #NNN.NNN. */; |
| 963 | if(offset>0 && !fossil_isspace(data[-1])){ |
| 964 | /* Only ever match if the *previous* character is whitespace or |
| 965 | we're at the start of the input. Note that we rely on fossil |
| @@ -967,56 +968,75 @@ | |
| 968 | "#Hash." will match while ".#Hash" won't. That's okay. */ |
| 969 | return 0; |
| 970 | } |
| 971 | assert( '#' == data[0] ); |
| 972 | if(size < 2) return 0; |
| 973 | end = 2; |
| 974 | if(fossil_isdigit(data[1])){ |
| 975 | numberMode = 1; |
| 976 | }else if(!fossil_isalpha(data[1])){ |
| 977 | switch(data[1] & 0xF0){ |
| 978 | /* Reminder: UTF8 char lengths can be determined by |
| 979 | masking against 0xF0: 0xf0==4, 0xe0==3, 0xc0==2, |
| 980 | else 1. */ |
| 981 | case 0xF0: end+=3; break; |
| 982 | case 0xE0: end+=2; break; |
| 983 | case 0xC0: end+=1; break; |
| 984 | default: return 0; |
| 985 | } |
| 986 | } |
| 987 | #if 0 |
| 988 | fprintf(stderr,"HASHREF offset=%d size=%d: %.*s\n", |
| 989 | (int)offset, (int)size, (int)size, data); |
| 990 | #endif |
| 991 | #define HASHTAG_LEGAL_END \ |
| 992 | case ' ': case '\t': case '\r': case '\n': \ |
| 993 | case ':': case ';': case '!': case '?': case ',' |
| 994 | /* ^^^^ '.' is handled separately */ |
| 995 | for(; end < size; ++end){ |
| 996 | char ch = data[end]; |
| 997 | switch(ch & 0xF0){ |
| 998 | case 0xF0: end+=3; continue; |
| 999 | case 0xE0: end+=2; continue; |
| 1000 | case 0xC0: end+=1; continue; |
| 1001 | case 0x80: goto hashref_bailout /*invalid UTF8*/; |
| 1002 | default: break; |
| 1003 | } |
| 1004 | #if 0 |
| 1005 | fprintf(stderr,"hashtag? checking... length=%d: %.*s\n", |
| 1006 | (int)end, (int)end, data); |
| 1007 | #endif |
| 1008 | switch(ch){ |
| 1009 | case '_': |
| 1010 | /* Multiple adjacent underscores not permitted. */ |
| 1011 | if(++nUscore>1) goto hashref_bailout; |
| 1012 | numberMode = 0; |
| 1013 | break; |
| 1014 | case '.': |
| 1015 | if(1==numberMode) ++numberMode; |
| 1016 | ch = 0; |
| 1017 | break; |
| 1018 | HASHTAG_LEGAL_END: |
| 1019 | ch = 0; |
| 1020 | break; |
| 1021 | case '0': case '1': case '2': case '3': case '4': |
| 1022 | case '5': case '6': case '7': case '8': case '9': |
| 1023 | nUscore = 0; |
| 1024 | break; |
| 1025 | default: |
| 1026 | if(numberMode || !fossil_isalpha(ch)){ |
| 1027 | goto hashref_bailout; |
| 1028 | } |
| 1029 | nUscore = 0; |
| 1030 | break; |
| 1031 | } |
| 1032 | if(ch) continue; |
| 1033 | break; |
| 1034 | } |
| 1035 | if((end<3/* #. or some such */ && !numberMode) |
| 1036 | || end>size/*from truncated multi-byte char*/){ |
| 1037 | return 0; |
| 1038 | } |
| 1039 | if(numberMode>1){ |
| 1040 | /* Check for trailing part of #NNN.nnn... */ |
| 1041 | assert('.'==data[end]); |
| 1042 | if(end<size-1 && fossil_isdigit(data[end+1])){ |
| @@ -1024,11 +1044,11 @@ | |
| 1044 | if(!fossil_isdigit(data[end])) break; |
| 1045 | } |
| 1046 | } |
| 1047 | } |
| 1048 | #if 0 |
| 1049 | fprintf(stderr,"???HASHREF length=%d: %.*s\n", |
| 1050 | (int)end, (int)end, data); |
| 1051 | #endif |
| 1052 | if(end<size){ |
| 1053 | /* Only match if we end at end of input or what "might" be the end |
| 1054 | of a natural language grammar construct, e.g. period or |
| @@ -1040,11 +1060,13 @@ | |
| 1060 | default: |
| 1061 | goto hashref_bailout; |
| 1062 | } |
| 1063 | } |
| 1064 | blob_init(&work, data + 1, end - 1); |
| 1065 | rndr->make.tagspan(ob, &work, |
| 1066 | numberMode ? MKDT_NUMTAG : MKDT_HASHTAG, |
| 1067 | rndr->make.opaque); |
| 1068 | return end; |
| 1069 | hashref_bailout: |
| 1070 | #if 0 |
| 1071 | fprintf(stderr,"BAILING HASHREF examined=%d:\n[%.*s] of\n[%.*s]\n", |
| 1072 | (int)end, (int)end, data, (int)size, data); |
| 1073 |
+3
-10
| --- src/markdown_html.c | ||
| +++ src/markdown_html.c | ||
| @@ -553,20 +553,13 @@ | ||
| 553 | 553 | BLOB_APPEND_LITERAL(ob, "<span data-"); |
| 554 | 554 | switch (type) { |
| 555 | 555 | case MKDT_ATREF: |
| 556 | 556 | cPrefix = '@'; BLOB_APPEND_LITERAL(ob, "atref"); break; |
| 557 | 557 | case MKDT_HASHTAG: |
| 558 | - cPrefix = '#'; | |
| 559 | - if(fossil_isdigit(*blob_str(text))){ | |
| 560 | - /* This is a #NNN or #NNN.NNN reference. Mark it differently | |
| 561 | - because these will be handled differently by higher-level | |
| 562 | - code than conventional hashtags will. */ | |
| 563 | - BLOB_APPEND_LITERAL(ob, "numtag"); | |
| 564 | - }else{ | |
| 565 | - BLOB_APPEND_LITERAL(ob, "hashtag"); | |
| 566 | - } | |
| 567 | - break; | |
| 558 | + cPrefix = '#'; BLOB_APPEND_LITERAL(ob, "hashtag"); break; | |
| 559 | + case MKDT_NUMTAG: | |
| 560 | + cPrefix = '#'; BLOB_APPEND_LITERAL(ob, "numtag"); break; | |
| 568 | 561 | } |
| 569 | 562 | BLOB_APPEND_LITERAL(ob, "=\""); |
| 570 | 563 | html_quote(ob, blob_buffer(text), blob_size(text)); |
| 571 | 564 | BLOB_APPEND_LITERAL(ob, "\""); |
| 572 | 565 | blob_appendf(ob, ">%c%b</span>", cPrefix,text); |
| 573 | 566 |
| --- src/markdown_html.c | |
| +++ src/markdown_html.c | |
| @@ -553,20 +553,13 @@ | |
| 553 | BLOB_APPEND_LITERAL(ob, "<span data-"); |
| 554 | switch (type) { |
| 555 | case MKDT_ATREF: |
| 556 | cPrefix = '@'; BLOB_APPEND_LITERAL(ob, "atref"); break; |
| 557 | case MKDT_HASHTAG: |
| 558 | cPrefix = '#'; |
| 559 | if(fossil_isdigit(*blob_str(text))){ |
| 560 | /* This is a #NNN or #NNN.NNN reference. Mark it differently |
| 561 | because these will be handled differently by higher-level |
| 562 | code than conventional hashtags will. */ |
| 563 | BLOB_APPEND_LITERAL(ob, "numtag"); |
| 564 | }else{ |
| 565 | BLOB_APPEND_LITERAL(ob, "hashtag"); |
| 566 | } |
| 567 | break; |
| 568 | } |
| 569 | BLOB_APPEND_LITERAL(ob, "=\""); |
| 570 | html_quote(ob, blob_buffer(text), blob_size(text)); |
| 571 | BLOB_APPEND_LITERAL(ob, "\""); |
| 572 | blob_appendf(ob, ">%c%b</span>", cPrefix,text); |
| 573 |
| --- src/markdown_html.c | |
| +++ src/markdown_html.c | |
| @@ -553,20 +553,13 @@ | |
| 553 | BLOB_APPEND_LITERAL(ob, "<span data-"); |
| 554 | switch (type) { |
| 555 | case MKDT_ATREF: |
| 556 | cPrefix = '@'; BLOB_APPEND_LITERAL(ob, "atref"); break; |
| 557 | case MKDT_HASHTAG: |
| 558 | cPrefix = '#'; BLOB_APPEND_LITERAL(ob, "hashtag"); break; |
| 559 | case MKDT_NUMTAG: |
| 560 | cPrefix = '#'; BLOB_APPEND_LITERAL(ob, "numtag"); break; |
| 561 | } |
| 562 | BLOB_APPEND_LITERAL(ob, "=\""); |
| 563 | html_quote(ob, blob_buffer(text), blob_size(text)); |
| 564 | BLOB_APPEND_LITERAL(ob, "\""); |
| 565 | blob_appendf(ob, ">%c%b</span>", cPrefix,text); |
| 566 |