Fossil SCM

More progress on the markdown #hashtag parsing.

stephan 2021-09-25 08:17 markdown-tagrefs
Commit ac5b66bb40a03f8383a6a61029519e64b0642fadc248ec2c642ff78ab026d266
1 file changed: +80 -9
--- src/markdown.c
+++ src/markdown.c
@@ -917,43 +917,114 @@
917917
rndr->make.tagspan(ob, &work, MKDT_ATREF, rndr->make.opaque);
918918
return end;
919919
}
920920
921921
/* char_hashref_tag -- '#' followed by "word" characters to tag
922
- * post numbers, hashtags, etc. */
922
+** post numbers, hashtags, etc.
923
+**
924
+** Basic syntax:
925
+**
926
+** ^[a-zA-Z]X*
927
+**
928
+** Where X is:
929
+**
930
+** - Any number of alphanumeric characters.
931
+**
932
+** - Single underscores. Adjacent underscores are not recognized
933
+** as valid hashtags. That decision is somewhat arbitrary
934
+** and up for debate.
935
+**
936
+** Hashtags must end at the end of input or be followed by whitespace
937
+** or what appears to be the end or separator of a logical
938
+** natural-language construct, e.g. period, colon, etc.
939
+**
940
+** Current limitations of this implementation:
941
+**
942
+** - ASCII only. Support for non-ASCII characters might be
943
+** interesting.
944
+**
945
+** - Currently requires starting alpha and trailing
946
+** alphanumeric or underscores. "Should" be extended to
947
+** handle #X[.Y], where X and optional Y are integer
948
+** values, for forum post references.
949
+*/
923950
static size_t char_hashref_tag(
924951
struct Blob *ob,
925952
struct render *rndr,
926953
char *data,
927954
size_t offset,
928955
size_t size
929956
){
930957
size_t end;
931958
struct Blob work = BLOB_INITIALIZER;
932
-
959
+ int nUscore = 0;
933960
if(offset>0 && !fossil_isspace(data[-1])){
934961
/* Only ever match if the *previous* character is
935
- whitespace or we're at the start of the input. */
962
+ whitespace or we're at the start of the input.
963
+ Note that we rely on fossil processing emphasis
964
+ markup before reaching this function, so *#Hash*
965
+ will Do The Right Thing. */
936966
return 0;
937967
}
938968
assert( '#' == data[0] );
939
- if(size < 2 || !fossil_isalnum(data[1])) return 0;
940
- /*fprintf(stderr,"HASHREF: %.*s\n", (int)size, data);*/
941
- for (end = 2; (end < size) && fossil_isalnum(data[end]); ++end);
969
+ if(size < 2 || !fossil_isalpha(data[1])) return 0;
970
+#if 0
971
+ fprintf(stderr,"HASHREF offset=%d size=%d: %.*s\n",
972
+ (int)offset, (int)size, (int)size, data);
973
+#endif
974
+#define HASHTAG_LEGAL_END \
975
+ case ' ': case '\t': case '\r': case '\n': case '.': case ':': case ';': case '!': case '?'
976
+ for(end = 2; end < size; ++end){
977
+ char ch = data[end];
978
+ switch(ch){
979
+ case '_':
980
+ /* Multiple adjacent underscores not permitted. */
981
+ if(++nUscore>1) goto hashref_bailout;
982
+ break;
983
+ HASHTAG_LEGAL_END:
984
+ if(end<3) goto hashref_bailout/*require 2+ characters (arbitrary)*/;
985
+ ch = 0;
986
+ break;
987
+ default:
988
+ if(!fossil_isalnum(ch)) goto hashref_bailout;
989
+ nUscore = 0;
990
+ break;
991
+ }
992
+ if(ch) continue;
993
+ else break;
994
+ }
995
+#if 0
996
+ fprintf(stderr,"?HASHREF length=%d: %.*s\n",
997
+ (int)end, (int)end, data);
998
+#endif
942999
/*TODO: in order to support detection of forum post-style
9431000
references, we need to recognize #X.Y, but only when X and Y are
9441001
both purely numeric and Y ends on a word/sentence
9451002
boundary.*/
9461003
if(end<size){
947
- /* Only match if we end at a dot or space or end of input */
948
- if(data[end]!='.' && !fossil_isspace(data[end])){
949
- return 0;
1004
+ /* Only match if we end at end of input or what "might" be the end
1005
+ of a natural language grammar construct, e.g. period or
1006
+ [semi]colon. */
1007
+ switch(data[end]){
1008
+ HASHTAG_LEGAL_END:
1009
+ /* We could arguably treat any leading multi-byte character as
1010
+ valid here. */
1011
+ break;
1012
+ default:
1013
+ goto hashref_bailout;
9501014
}
9511015
}
9521016
blob_init(&work, data + 1, end - 1);
9531017
rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque);
9541018
return end;
1019
+ hashref_bailout:
1020
+#if 0
1021
+ fprintf(stderr,"BAILING HASHREF examined=%d:\n[%.*s] of\n[%.*s]\n",
1022
+ (int)end, (int)end, data, (int)size, data);
1023
+#endif
1024
+#undef HASHTAG_LEGAL_END
1025
+ return 0;
9551026
}
9561027
9571028
9581029
/* char_langle_tag -- '<' when tags or autolinks are allowed */
9591030
static size_t char_langle_tag(
9601031
--- src/markdown.c
+++ src/markdown.c
@@ -917,43 +917,114 @@
917 rndr->make.tagspan(ob, &work, MKDT_ATREF, rndr->make.opaque);
918 return end;
919 }
920
921 /* char_hashref_tag -- '#' followed by "word" characters to tag
922 * post numbers, hashtags, etc. */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
923 static size_t char_hashref_tag(
924 struct Blob *ob,
925 struct render *rndr,
926 char *data,
927 size_t offset,
928 size_t size
929 ){
930 size_t end;
931 struct Blob work = BLOB_INITIALIZER;
932
933 if(offset>0 && !fossil_isspace(data[-1])){
934 /* Only ever match if the *previous* character is
935 whitespace or we're at the start of the input. */
 
 
 
936 return 0;
937 }
938 assert( '#' == data[0] );
939 if(size < 2 || !fossil_isalnum(data[1])) return 0;
940 /*fprintf(stderr,"HASHREF: %.*s\n", (int)size, data);*/
941 for (end = 2; (end < size) && fossil_isalnum(data[end]); ++end);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
942 /*TODO: in order to support detection of forum post-style
943 references, we need to recognize #X.Y, but only when X and Y are
944 both purely numeric and Y ends on a word/sentence
945 boundary.*/
946 if(end<size){
947 /* Only match if we end at a dot or space or end of input */
948 if(data[end]!='.' && !fossil_isspace(data[end])){
949 return 0;
 
 
 
 
 
 
 
950 }
951 }
952 blob_init(&work, data + 1, end - 1);
953 rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque);
954 return end;
 
 
 
 
 
 
 
955 }
956
957
958 /* char_langle_tag -- '<' when tags or autolinks are allowed */
959 static size_t char_langle_tag(
960
--- src/markdown.c
+++ src/markdown.c
@@ -917,43 +917,114 @@
917 rndr->make.tagspan(ob, &work, MKDT_ATREF, rndr->make.opaque);
918 return end;
919 }
920
921 /* char_hashref_tag -- '#' followed by "word" characters to tag
922 ** post numbers, hashtags, etc.
923 **
924 ** Basic syntax:
925 **
926 ** ^[a-zA-Z]X*
927 **
928 ** Where X is:
929 **
930 ** - Any number of alphanumeric characters.
931 **
932 ** - Single underscores. Adjacent underscores are not recognized
933 ** as valid hashtags. That decision is somewhat arbitrary
934 ** and up for debate.
935 **
936 ** Hashtags must end at the end of input or be followed by whitespace
937 ** or what appears to be the end or separator of a logical
938 ** natural-language construct, e.g. period, colon, etc.
939 **
940 ** Current limitations of this implementation:
941 **
942 ** - ASCII only. Support for non-ASCII characters might be
943 ** interesting.
944 **
945 ** - Currently requires starting alpha and trailing
946 ** alphanumeric or underscores. "Should" be extended to
947 ** handle #X[.Y], where X and optional Y are integer
948 ** values, for forum post references.
949 */
950 static size_t char_hashref_tag(
951 struct Blob *ob,
952 struct render *rndr,
953 char *data,
954 size_t offset,
955 size_t size
956 ){
957 size_t end;
958 struct Blob work = BLOB_INITIALIZER;
959 int nUscore = 0;
960 if(offset>0 && !fossil_isspace(data[-1])){
961 /* Only ever match if the *previous* character is
962 whitespace or we're at the start of the input.
963 Note that we rely on fossil processing emphasis
964 markup before reaching this function, so *#Hash*
965 will Do The Right Thing. */
966 return 0;
967 }
968 assert( '#' == data[0] );
969 if(size < 2 || !fossil_isalpha(data[1])) return 0;
970 #if 0
971 fprintf(stderr,"HASHREF offset=%d size=%d: %.*s\n",
972 (int)offset, (int)size, (int)size, data);
973 #endif
974 #define HASHTAG_LEGAL_END \
975 case ' ': case '\t': case '\r': case '\n': case '.': case ':': case ';': case '!': case '?'
976 for(end = 2; end < size; ++end){
977 char ch = data[end];
978 switch(ch){
979 case '_':
980 /* Multiple adjacent underscores not permitted. */
981 if(++nUscore>1) goto hashref_bailout;
982 break;
983 HASHTAG_LEGAL_END:
984 if(end<3) goto hashref_bailout/*require 2+ characters (arbitrary)*/;
985 ch = 0;
986 break;
987 default:
988 if(!fossil_isalnum(ch)) goto hashref_bailout;
989 nUscore = 0;
990 break;
991 }
992 if(ch) continue;
993 else break;
994 }
995 #if 0
996 fprintf(stderr,"?HASHREF length=%d: %.*s\n",
997 (int)end, (int)end, data);
998 #endif
999 /*TODO: in order to support detection of forum post-style
1000 references, we need to recognize #X.Y, but only when X and Y are
1001 both purely numeric and Y ends on a word/sentence
1002 boundary.*/
1003 if(end<size){
1004 /* Only match if we end at end of input or what "might" be the end
1005 of a natural language grammar construct, e.g. period or
1006 [semi]colon. */
1007 switch(data[end]){
1008 HASHTAG_LEGAL_END:
1009 /* We could arguably treat any leading multi-byte character as
1010 valid here. */
1011 break;
1012 default:
1013 goto hashref_bailout;
1014 }
1015 }
1016 blob_init(&work, data + 1, end - 1);
1017 rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque);
1018 return end;
1019 hashref_bailout:
1020 #if 0
1021 fprintf(stderr,"BAILING HASHREF examined=%d:\n[%.*s] of\n[%.*s]\n",
1022 (int)end, (int)end, data, (int)size, data);
1023 #endif
1024 #undef HASHTAG_LEGAL_END
1025 return 0;
1026 }
1027
1028
1029 /* char_langle_tag -- '<' when tags or autolinks are allowed */
1030 static size_t char_langle_tag(
1031

Keyboard Shortcuts

Open search: /
Next entry (timeline): j
Previous entry (timeline): k
Open focused entry: Enter
Show this help: ?
Toggle theme: Top nav button