Fossil SCM

Any non-ASCII characters are now considered valid for use in hashtags.

stephan 2021-10-05 17:31 markdown-tagrefs
Commit 4f68a1306f7236f58ccd8ee24e0eb5a609eb7f0f30edf60d73320c6a848b6c41
2 files changed: src/markdown.c (+37 -15), src/markdown_html.c (+3 -10)
+37 -15
--- src/markdown.c
+++ src/markdown.c
@@ -43,11 +43,12 @@
4343
};
4444
4545
/* mkd_tagspan -- type of tagged <span> */
4646
enum mkd_tagspan {
4747
MKDT_ATREF, /* @name references, as in /chat attention targeting */
48
- MKDT_HASHTAG, /* #hash tags, message IDs, etc. */
48
+ MKDT_HASHTAG, /* #hashtags */
49
+ MKDT_NUMTAG /* #123[.456] /chat or /forum message IDs. */
4950
};
5051
5152
/* mkd_renderer -- functions for rendering parsed data */
5253
struct mkd_renderer {
5354
/* document level callbacks */
@@ -954,11 +955,11 @@
954955
size_t offset,
955956
size_t size
956957
){
957958
size_t end;
958959
struct Blob work = BLOB_INITIALIZER;
959
- int nUscore = 0; /* Consecutive underscore counter */;
960
+ int nUscore = 0; /* Consecutive underscore counter */
960961
int numberMode = 0 /* 0 for normal, 1 for #NNN numeric,
961962
and 2 for #NNN.NNN. */;
962963
if(offset>0 && !fossil_isspace(data[-1])){
963964
/* Only ever match if the *previous* character is whitespace or
964965
we're at the start of the input. Note that we rely on fossil
@@ -967,56 +968,75 @@
967968
"#Hash." will match while ".#Hash" won't. That's okay. */
968969
return 0;
969970
}
970971
assert( '#' == data[0] );
971972
if(size < 2) return 0;
973
+ end = 2;
972974
if(fossil_isdigit(data[1])){
973975
numberMode = 1;
974976
}else if(!fossil_isalpha(data[1])){
975
- return 0;
977
+ switch(data[1] & 0xF0){
978
+ /* Reminder: UTF8 char lengths can be determined by
979
+ masking against 0xF0: 0xf0==4, 0xe0==3, 0xc0==2,
980
+ else 1. */
981
+ case 0xF0: end+=3; break;
982
+ case 0xE0: end+=2; break;
983
+ case 0xC0: end+=1; break;
984
+ default: return 0;
985
+ }
976986
}
977987
#if 0
978988
fprintf(stderr,"HASHREF offset=%d size=%d: %.*s\n",
979989
(int)offset, (int)size, (int)size, data);
980990
#endif
981991
#define HASHTAG_LEGAL_END \
982992
case ' ': case '\t': case '\r': case '\n': \
983993
case ':': case ';': case '!': case '?': case ','
984994
/* ^^^^ '.' is handled separately */
985
- for(end = 2; end < size; ++end){
995
+ for(; end < size; ++end){
986996
char ch = data[end];
987
- /* Potential TODO: if (ch & 0xF0), treat it as valid, skip that
988
- multi-byte character's length characters, and continue
989
- looping. Reminder: UTF8 char lengths can be determined by
990
- masking against 0xF0: 0xf0==4, 0xe0==3, 0xc0==2, else 1. */
997
+ switch(ch & 0xF0){
998
+ case 0xF0: end+=3; continue;
999
+ case 0xE0: end+=2; continue;
1000
+ case 0xC0: end+=1; continue;
1001
+ case 0x80: goto hashref_bailout /*invalid UTF8*/;
1002
+ default: break;
1003
+ }
1004
+#if 0
1005
+ fprintf(stderr,"hashtag? checking... length=%d: %.*s\n",
1006
+ (int)end, (int)end, data);
1007
+#endif
9911008
switch(ch){
9921009
case '_':
9931010
/* Multiple adjacent underscores not permitted. */
994
- if(numberMode>0 || ++nUscore>1) goto hashref_bailout;
1011
+ if(++nUscore>1) goto hashref_bailout;
1012
+ numberMode = 0;
9951013
break;
9961014
case '.':
9971015
if(1==numberMode) ++numberMode;
9981016
ch = 0;
9991017
break;
10001018
HASHTAG_LEGAL_END:
1001
- if(numberMode==0 && end<3){
1002
- goto hashref_bailout/*require 2+ characters (arbitrary)*/;
1003
- }
10041019
ch = 0;
10051020
break;
10061021
case '0': case '1': case '2': case '3': case '4':
10071022
case '5': case '6': case '7': case '8': case '9':
1023
+ nUscore = 0;
10081024
break;
10091025
default:
1010
- if(numberMode!=0 || !fossil_isalpha(ch)){
1026
+ if(numberMode || !fossil_isalpha(ch)){
10111027
goto hashref_bailout;
10121028
}
10131029
nUscore = 0;
10141030
break;
10151031
}
10161032
if(ch) continue;
10171033
break;
1034
+ }
1035
+ if((end<3/* #. or some such */ && !numberMode)
1036
+ || end>size/*from truncated multi-byte char*/){
1037
+ return 0;
10181038
}
10191039
if(numberMode>1){
10201040
/* Check for trailing part of #NNN.nnn... */
10211041
assert('.'==data[end]);
10221042
if(end<size-1 && fossil_isdigit(data[end+1])){
@@ -1024,11 +1044,11 @@
10241044
if(!fossil_isdigit(data[end])) break;
10251045
}
10261046
}
10271047
}
10281048
#if 0
1029
- fprintf(stderr,"?HASHREF length=%d: %.*s\n",
1049
+ fprintf(stderr,"???HASHREF length=%d: %.*s\n",
10301050
(int)end, (int)end, data);
10311051
#endif
10321052
if(end<size){
10331053
/* Only match if we end at end of input or what "might" be the end
10341054
of a natural language grammar construct, e.g. period or
@@ -1040,11 +1060,13 @@
10401060
default:
10411061
goto hashref_bailout;
10421062
}
10431063
}
10441064
blob_init(&work, data + 1, end - 1);
1045
- rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque);
1065
+ rndr->make.tagspan(ob, &work,
1066
+ numberMode ? MKDT_NUMTAG : MKDT_HASHTAG,
1067
+ rndr->make.opaque);
10461068
return end;
10471069
hashref_bailout:
10481070
#if 0
10491071
fprintf(stderr,"BAILING HASHREF examined=%d:\n[%.*s] of\n[%.*s]\n",
10501072
(int)end, (int)end, data, (int)size, data);
10511073
--- src/markdown.c
+++ src/markdown.c
@@ -43,11 +43,12 @@
43 };
44
45 /* mkd_tagspan -- type of tagged <span> */
46 enum mkd_tagspan {
47 MKDT_ATREF, /* @name references, as in /chat attention targeting */
48 MKDT_HASHTAG, /* #hash tags, message IDs, etc. */
 
49 };
50
51 /* mkd_renderer -- functions for rendering parsed data */
52 struct mkd_renderer {
53 /* document level callbacks */
@@ -954,11 +955,11 @@
954 size_t offset,
955 size_t size
956 ){
957 size_t end;
958 struct Blob work = BLOB_INITIALIZER;
959 int nUscore = 0; /* Consecutive underscore counter */;
960 int numberMode = 0 /* 0 for normal, 1 for #NNN numeric,
961 and 2 for #NNN.NNN. */;
962 if(offset>0 && !fossil_isspace(data[-1])){
963 /* Only ever match if the *previous* character is whitespace or
964 we're at the start of the input. Note that we rely on fossil
@@ -967,56 +968,75 @@
967 "#Hash." will match while ".#Hash" won't. That's okay. */
968 return 0;
969 }
970 assert( '#' == data[0] );
971 if(size < 2) return 0;
 
972 if(fossil_isdigit(data[1])){
973 numberMode = 1;
974 }else if(!fossil_isalpha(data[1])){
975 return 0;
 
 
 
 
 
 
 
 
976 }
977 #if 0
978 fprintf(stderr,"HASHREF offset=%d size=%d: %.*s\n",
979 (int)offset, (int)size, (int)size, data);
980 #endif
981 #define HASHTAG_LEGAL_END \
982 case ' ': case '\t': case '\r': case '\n': \
983 case ':': case ';': case '!': case '?': case ','
984 /* ^^^^ '.' is handled separately */
985 for(end = 2; end < size; ++end){
986 char ch = data[end];
987 /* Potential TODO: if (ch & 0xF0), treat it as valid, skip that
988 multi-byte character's length characters, and continue
989 looping. Reminder: UTF8 char lengths can be determined by
990 masking against 0xF0: 0xf0==4, 0xe0==3, 0xc0==2, else 1. */
 
 
 
 
 
 
 
991 switch(ch){
992 case '_':
993 /* Multiple adjacent underscores not permitted. */
994 if(numberMode>0 || ++nUscore>1) goto hashref_bailout;
 
995 break;
996 case '.':
997 if(1==numberMode) ++numberMode;
998 ch = 0;
999 break;
1000 HASHTAG_LEGAL_END:
1001 if(numberMode==0 && end<3){
1002 goto hashref_bailout/*require 2+ characters (arbitrary)*/;
1003 }
1004 ch = 0;
1005 break;
1006 case '0': case '1': case '2': case '3': case '4':
1007 case '5': case '6': case '7': case '8': case '9':
 
1008 break;
1009 default:
1010 if(numberMode!=0 || !fossil_isalpha(ch)){
1011 goto hashref_bailout;
1012 }
1013 nUscore = 0;
1014 break;
1015 }
1016 if(ch) continue;
1017 break;
 
 
 
 
1018 }
1019 if(numberMode>1){
1020 /* Check for trailing part of #NNN.nnn... */
1021 assert('.'==data[end]);
1022 if(end<size-1 && fossil_isdigit(data[end+1])){
@@ -1024,11 +1044,11 @@
1024 if(!fossil_isdigit(data[end])) break;
1025 }
1026 }
1027 }
1028 #if 0
1029 fprintf(stderr,"?HASHREF length=%d: %.*s\n",
1030 (int)end, (int)end, data);
1031 #endif
1032 if(end<size){
1033 /* Only match if we end at end of input or what "might" be the end
1034 of a natural language grammar construct, e.g. period or
@@ -1040,11 +1060,13 @@
1040 default:
1041 goto hashref_bailout;
1042 }
1043 }
1044 blob_init(&work, data + 1, end - 1);
1045 rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque);
 
 
1046 return end;
1047 hashref_bailout:
1048 #if 0
1049 fprintf(stderr,"BAILING HASHREF examined=%d:\n[%.*s] of\n[%.*s]\n",
1050 (int)end, (int)end, data, (int)size, data);
1051
--- src/markdown.c
+++ src/markdown.c
@@ -43,11 +43,12 @@
43 };
44
45 /* mkd_tagspan -- type of tagged <span> */
46 enum mkd_tagspan {
47 MKDT_ATREF, /* @name references, as in /chat attention targeting */
48 MKDT_HASHTAG, /* #hashtags */
49 MKDT_NUMTAG /* #123[.456] /chat or /forum message IDs. */
50 };
51
52 /* mkd_renderer -- functions for rendering parsed data */
53 struct mkd_renderer {
54 /* document level callbacks */
@@ -954,11 +955,11 @@
955 size_t offset,
956 size_t size
957 ){
958 size_t end;
959 struct Blob work = BLOB_INITIALIZER;
960 int nUscore = 0; /* Consecutive underscore counter */
961 int numberMode = 0 /* 0 for normal, 1 for #NNN numeric,
962 and 2 for #NNN.NNN. */;
963 if(offset>0 && !fossil_isspace(data[-1])){
964 /* Only ever match if the *previous* character is whitespace or
965 we're at the start of the input. Note that we rely on fossil
@@ -967,56 +968,75 @@
968 "#Hash." will match while ".#Hash" won't. That's okay. */
969 return 0;
970 }
971 assert( '#' == data[0] );
972 if(size < 2) return 0;
973 end = 2;
974 if(fossil_isdigit(data[1])){
975 numberMode = 1;
976 }else if(!fossil_isalpha(data[1])){
977 switch(data[1] & 0xF0){
978 /* Reminder: UTF8 char lengths can be determined by
979 masking against 0xF0: 0xf0==4, 0xe0==3, 0xc0==2,
980 else 1. */
981 case 0xF0: end+=3; break;
982 case 0xE0: end+=2; break;
983 case 0xC0: end+=1; break;
984 default: return 0;
985 }
986 }
987 #if 0
988 fprintf(stderr,"HASHREF offset=%d size=%d: %.*s\n",
989 (int)offset, (int)size, (int)size, data);
990 #endif
991 #define HASHTAG_LEGAL_END \
992 case ' ': case '\t': case '\r': case '\n': \
993 case ':': case ';': case '!': case '?': case ','
994 /* ^^^^ '.' is handled separately */
995 for(; end < size; ++end){
996 char ch = data[end];
997 switch(ch & 0xF0){
998 case 0xF0: end+=3; continue;
999 case 0xE0: end+=2; continue;
1000 case 0xC0: end+=1; continue;
1001 case 0x80: goto hashref_bailout /*invalid UTF8*/;
1002 default: break;
1003 }
1004 #if 0
1005 fprintf(stderr,"hashtag? checking... length=%d: %.*s\n",
1006 (int)end, (int)end, data);
1007 #endif
1008 switch(ch){
1009 case '_':
1010 /* Multiple adjacent underscores not permitted. */
1011 if(++nUscore>1) goto hashref_bailout;
1012 numberMode = 0;
1013 break;
1014 case '.':
1015 if(1==numberMode) ++numberMode;
1016 ch = 0;
1017 break;
1018 HASHTAG_LEGAL_END:
 
 
 
1019 ch = 0;
1020 break;
1021 case '0': case '1': case '2': case '3': case '4':
1022 case '5': case '6': case '7': case '8': case '9':
1023 nUscore = 0;
1024 break;
1025 default:
1026 if(numberMode || !fossil_isalpha(ch)){
1027 goto hashref_bailout;
1028 }
1029 nUscore = 0;
1030 break;
1031 }
1032 if(ch) continue;
1033 break;
1034 }
1035 if((end<3/* #. or some such */ && !numberMode)
1036 || end>size/*from truncated multi-byte char*/){
1037 return 0;
1038 }
1039 if(numberMode>1){
1040 /* Check for trailing part of #NNN.nnn... */
1041 assert('.'==data[end]);
1042 if(end<size-1 && fossil_isdigit(data[end+1])){
@@ -1024,11 +1044,11 @@
1044 if(!fossil_isdigit(data[end])) break;
1045 }
1046 }
1047 }
1048 #if 0
1049 fprintf(stderr,"???HASHREF length=%d: %.*s\n",
1050 (int)end, (int)end, data);
1051 #endif
1052 if(end<size){
1053 /* Only match if we end at end of input or what "might" be the end
1054 of a natural language grammar construct, e.g. period or
@@ -1040,11 +1060,13 @@
1060 default:
1061 goto hashref_bailout;
1062 }
1063 }
1064 blob_init(&work, data + 1, end - 1);
1065 rndr->make.tagspan(ob, &work,
1066 numberMode ? MKDT_NUMTAG : MKDT_HASHTAG,
1067 rndr->make.opaque);
1068 return end;
1069 hashref_bailout:
1070 #if 0
1071 fprintf(stderr,"BAILING HASHREF examined=%d:\n[%.*s] of\n[%.*s]\n",
1072 (int)end, (int)end, data, (int)size, data);
1073
--- src/markdown_html.c
+++ src/markdown_html.c
@@ -553,20 +553,13 @@
553553
BLOB_APPEND_LITERAL(ob, "<span data-");
554554
switch (type) {
555555
case MKDT_ATREF:
556556
cPrefix = '@'; BLOB_APPEND_LITERAL(ob, "atref"); break;
557557
case MKDT_HASHTAG:
558
- cPrefix = '#';
559
- if(fossil_isdigit(*blob_str(text))){
560
- /* This is a #NNN or #NNN.NNN reference. Mark it differently
561
- because these will be handled differently by higher-level
562
- code than conventional hashtags will. */
563
- BLOB_APPEND_LITERAL(ob, "numtag");
564
- }else{
565
- BLOB_APPEND_LITERAL(ob, "hashtag");
566
- }
567
- break;
558
+ cPrefix = '#'; BLOB_APPEND_LITERAL(ob, "hashtag"); break;
559
+ case MKDT_NUMTAG:
560
+ cPrefix = '#'; BLOB_APPEND_LITERAL(ob, "numtag"); break;
568561
}
569562
BLOB_APPEND_LITERAL(ob, "=\"");
570563
html_quote(ob, blob_buffer(text), blob_size(text));
571564
BLOB_APPEND_LITERAL(ob, "\"");
572565
blob_appendf(ob, ">%c%b</span>", cPrefix,text);
573566
--- src/markdown_html.c
+++ src/markdown_html.c
@@ -553,20 +553,13 @@
553 BLOB_APPEND_LITERAL(ob, "<span data-");
554 switch (type) {
555 case MKDT_ATREF:
556 cPrefix = '@'; BLOB_APPEND_LITERAL(ob, "atref"); break;
557 case MKDT_HASHTAG:
558 cPrefix = '#';
559 if(fossil_isdigit(*blob_str(text))){
560 /* This is a #NNN or #NNN.NNN reference. Mark it differently
561 because these will be handled differently by higher-level
562 code than conventional hashtags will. */
563 BLOB_APPEND_LITERAL(ob, "numtag");
564 }else{
565 BLOB_APPEND_LITERAL(ob, "hashtag");
566 }
567 break;
568 }
569 BLOB_APPEND_LITERAL(ob, "=\"");
570 html_quote(ob, blob_buffer(text), blob_size(text));
571 BLOB_APPEND_LITERAL(ob, "\"");
572 blob_appendf(ob, ">%c%b</span>", cPrefix,text);
573
--- src/markdown_html.c
+++ src/markdown_html.c
@@ -553,20 +553,13 @@
553 BLOB_APPEND_LITERAL(ob, "<span data-");
554 switch (type) {
555 case MKDT_ATREF:
556 cPrefix = '@'; BLOB_APPEND_LITERAL(ob, "atref"); break;
557 case MKDT_HASHTAG:
558 cPrefix = '#'; BLOB_APPEND_LITERAL(ob, "hashtag"); break;
559 case MKDT_NUMTAG:
560 cPrefix = '#'; BLOB_APPEND_LITERAL(ob, "numtag"); break;
 
 
 
 
 
 
 
561 }
562 BLOB_APPEND_LITERAL(ob, "=\"");
563 html_quote(ob, blob_buffer(text), blob_size(text));
564 BLOB_APPEND_LITERAL(ob, "\"");
565 blob_appendf(ob, ">%c%b</span>", cPrefix,text);
566

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button