Fossil SCM

More progress on the markdown #hashtag parsing.

stephan 2021-09-25 08:17 markdown-tagrefs

Commit ac5b66bb40a03f8383a6a61029519e64b0642fadc248ec2c642ff78ab026d266

Parent 5618cceb7bf9e8c…

1 file changed +80 -9

M src/markdown.c

+80 -9

		--- src/markdown.c
		+++ src/markdown.c
		@@ -917,43 +917,114 @@
917	917	rndr->make.tagspan(ob, &work, MKDT_ATREF, rndr->make.opaque);
918	918	return end;
919	919	}
920	920
921	921	/* char_hashref_tag -- '#' followed by "word" characters to tag
922		- * post numbers, hashtags, etc. */
	922	+** post numbers, hashtags, etc.
	923	+**
	924	+** Basic syntax:
	925	+**
	926	+** ^[a-zA-Z]X*
	927	+**
	928	+** Where X is:
	929	+**
	930	+** - Any number of alphanumeric characters.
	931	+**
	932	+** - Single underscores. Adjacent underscores are not recognized
	933	+** as valid hashtags. That decision is somewhat arbitrary
	934	+** and up for debate.
	935	+**
	936	+** Hashtags must end at the end of input or be followed by whitespace
	937	+** or what appears to be the end or separator of a logical
	938	+** natural-language construct, e.g. period, colon, etc.
	939	+**
	940	+** Current limitations of this implementation:
	941	+**
	942	+** - ASCII only. Support for non-ASCII characters might be
	943	+** interesting.
	944	+**
	945	+** - Currently requires starting alpha and trailing
	946	+** alphanumeric or underscores. "Should" be extended to
	947	+** handle #X[.Y], where X and optional Y are integer
	948	+** values, for forum post references.
	949	+*/
923	950	static size_t char_hashref_tag(
924	951	struct Blob *ob,
925	952	struct render *rndr,
926	953	char *data,
927	954	size_t offset,
928	955	size_t size
929	956	){
930	957	size_t end;
931	958	struct Blob work = BLOB_INITIALIZER;
932		-
	959	+ int nUscore = 0;
933	960	if(offset>0 && !fossil_isspace(data[-1])){
934	961	/* Only ever match if the previous character is
935		- whitespace or we're at the start of the input. */
	962	+ whitespace or we're at the start of the input.
	963	+ Note that we rely on fossil processing emphasis
	964	+ markup before reaching this function, so *#Hash*
	965	+ will Do The Right Thing. */
936	966	return 0;
937	967	}
938	968	assert( '#' == data[0] );
939		- if(size < 2 || !fossil_isalnum(data[1])) return 0;
940		- /*fprintf(stderr,"HASHREF: %.*s\n", (int)size, data);*/
941		- for (end = 2; (end < size) && fossil_isalnum(data[end]); ++end);
	969	+ if(size < 2 || !fossil_isalpha(data[1])) return 0;
	970	+#if 0
	971	+ fprintf(stderr,"HASHREF offset=%d size=%d: %.*s\n",
	972	+ (int)offset, (int)size, (int)size, data);
	973	+#endif
	974	+#define HASHTAG_LEGAL_END \
	975	+ case ' ': case '\t': case '\r': case '\n': case '.': case ':': case ';': case '!': case '?'
	976	+ for(end = 2; end < size; ++end){
	977	+ char ch = data[end];
	978	+ switch(ch){
	979	+ case '_':
	980	+ /* Multiple adjacent underscores not permitted. */
	981	+ if(++nUscore>1) goto hashref_bailout;
	982	+ break;
	983	+ HASHTAG_LEGAL_END:
	984	+ if(end<3) goto hashref_bailout/*require 2+ characters (arbitrary)*/;
	985	+ ch = 0;
	986	+ break;
	987	+ default:
	988	+ if(!fossil_isalnum(ch)) goto hashref_bailout;
	989	+ nUscore = 0;
	990	+ break;
	991	+ }
	992	+ if(ch) continue;
	993	+ else break;
	994	+ }
	995	+#if 0
	996	+ fprintf(stderr,"?HASHREF length=%d: %.*s\n",
	997	+ (int)end, (int)end, data);
	998	+#endif
942	999	/*TODO: in order to support detection of forum post-style
943	1000	references, we need to recognize #X.Y, but only when X and Y are
944	1001	both purely numeric and Y ends on a word/sentence
945	1002	boundary.*/
946	1003	if(end<size){
947		- /* Only match if we end at a dot or space or end of input */
948		- if(data[end]!='.' && !fossil_isspace(data[end])){
949		- return 0;
	1004	+ /* Only match if we end at end of input or what "might" be the end
	1005	+ of a natural language grammar construct, e.g. period or
	1006	+ [semi]colon. */
	1007	+ switch(data[end]){
	1008	+ HASHTAG_LEGAL_END:
	1009	+ /* We could arguably treat any leading multi-byte character as
	1010	+ valid here. */
	1011	+ break;
	1012	+ default:
	1013	+ goto hashref_bailout;
950	1014	}
951	1015	}
952	1016	blob_init(&work, data + 1, end - 1);
953	1017	rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque);
954	1018	return end;
	1019	+ hashref_bailout:
	1020	+#if 0
	1021	+ fprintf(stderr,"BAILING HASHREF examined=%d:\n[%.*s] of\n[%.*s]\n",
	1022	+ (int)end, (int)end, data, (int)size, data);
	1023	+#endif
	1024	+#undef HASHTAG_LEGAL_END
	1025	+ return 0;
955	1026	}
956	1027
957	1028
958	1029	/* char_langle_tag -- '<' when tags or autolinks are allowed */
959	1030	static size_t char_langle_tag(
960	1031

	--- src/markdown.c
	+++ src/markdown.c
	@@ -917,43 +917,114 @@
917	rndr->make.tagspan(ob, &work, MKDT_ATREF, rndr->make.opaque);
918	return end;
919	}
920
921	/* char_hashref_tag -- '#' followed by "word" characters to tag
922	* post numbers, hashtags, etc. */



























923	static size_t char_hashref_tag(
924	struct Blob *ob,
925	struct render *rndr,
926	char *data,
927	size_t offset,
928	size_t size
929	){
930	size_t end;
931	struct Blob work = BLOB_INITIALIZER;
932
933	if(offset>0 && !fossil_isspace(data[-1])){
934	/* Only ever match if the previous character is
935	whitespace or we're at the start of the input. */



936	return 0;
937	}
938	assert( '#' == data[0] );
939	if(size < 2 || !fossil_isalnum(data[1])) return 0;
940	/*fprintf(stderr,"HASHREF: %.*s\n", (int)size, data);*/
941	for (end = 2; (end < size) && fossil_isalnum(data[end]); ++end);



























942	/*TODO: in order to support detection of forum post-style
943	references, we need to recognize #X.Y, but only when X and Y are
944	both purely numeric and Y ends on a word/sentence
945	boundary.*/
946	if(end<size){
947	/* Only match if we end at a dot or space or end of input */
948	if(data[end]!='.' && !fossil_isspace(data[end])){
949	return 0;







950	}
951	}
952	blob_init(&work, data + 1, end - 1);
953	rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque);
954	return end;







955	}
956
957
958	/* char_langle_tag -- '<' when tags or autolinks are allowed */
959	static size_t char_langle_tag(
960

	--- src/markdown.c
	+++ src/markdown.c
	@@ -917,43 +917,114 @@
917	rndr->make.tagspan(ob, &work, MKDT_ATREF, rndr->make.opaque);
918	return end;
919	}
920
921	/* char_hashref_tag -- '#' followed by "word" characters to tag
922	** post numbers, hashtags, etc.
923	**
924	** Basic syntax:
925	**
926	** ^[a-zA-Z]X*
927	**
928	** Where X is:
929	**
930	** - Any number of alphanumeric characters.
931	**
932	** - Single underscores. Adjacent underscores are not recognized
933	** as valid hashtags. That decision is somewhat arbitrary
934	** and up for debate.
935	**
936	** Hashtags must end at the end of input or be followed by whitespace
937	** or what appears to be the end or separator of a logical
938	** natural-language construct, e.g. period, colon, etc.
939	**
940	** Current limitations of this implementation:
941	**
942	** - ASCII only. Support for non-ASCII characters might be
943	** interesting.
944	**
945	** - Currently requires starting alpha and trailing
946	** alphanumeric or underscores. "Should" be extended to
947	** handle #X[.Y], where X and optional Y are integer
948	** values, for forum post references.
949	*/
950	static size_t char_hashref_tag(
951	struct Blob *ob,
952	struct render *rndr,
953	char *data,
954	size_t offset,
955	size_t size
956	){
957	size_t end;
958	struct Blob work = BLOB_INITIALIZER;
959	int nUscore = 0;
960	if(offset>0 && !fossil_isspace(data[-1])){
961	/* Only ever match if the previous character is
962	whitespace or we're at the start of the input.
963	Note that we rely on fossil processing emphasis
964	markup before reaching this function, so *#Hash*
965	will Do The Right Thing. */
966	return 0;
967	}
968	assert( '#' == data[0] );
969	if(size < 2 || !fossil_isalpha(data[1])) return 0;
970	#if 0
971	fprintf(stderr,"HASHREF offset=%d size=%d: %.*s\n",
972	(int)offset, (int)size, (int)size, data);
973	#endif
974	#define HASHTAG_LEGAL_END \
975	case ' ': case '\t': case '\r': case '\n': case '.': case ':': case ';': case '!': case '?'
976	for(end = 2; end < size; ++end){
977	char ch = data[end];
978	switch(ch){
979	case '_':
980	/* Multiple adjacent underscores not permitted. */
981	if(++nUscore>1) goto hashref_bailout;
982	break;
983	HASHTAG_LEGAL_END:
984	if(end<3) goto hashref_bailout/*require 2+ characters (arbitrary)*/;
985	ch = 0;
986	break;
987	default:
988	if(!fossil_isalnum(ch)) goto hashref_bailout;
989	nUscore = 0;
990	break;
991	}
992	if(ch) continue;
993	else break;
994	}
995	#if 0
996	fprintf(stderr,"?HASHREF length=%d: %.*s\n",
997	(int)end, (int)end, data);
998	#endif
999	/*TODO: in order to support detection of forum post-style
1000	references, we need to recognize #X.Y, but only when X and Y are
1001	both purely numeric and Y ends on a word/sentence
1002	boundary.*/
1003	if(end<size){
1004	/* Only match if we end at end of input or what "might" be the end
1005	of a natural language grammar construct, e.g. period or
1006	[semi]colon. */
1007	switch(data[end]){
1008	HASHTAG_LEGAL_END:
1009	/* We could arguably treat any leading multi-byte character as
1010	valid here. */
1011	break;
1012	default:
1013	goto hashref_bailout;
1014	}
1015	}
1016	blob_init(&work, data + 1, end - 1);
1017	rndr->make.tagspan(ob, &work, MKDT_HASHTAG, rndr->make.opaque);
1018	return end;
1019	hashref_bailout:
1020	#if 0
1021	fprintf(stderr,"BAILING HASHREF examined=%d:\n[%.*s] of\n[%.*s]\n",
1022	(int)end, (int)end, data, (int)size, data);
1023	#endif
1024	#undef HASHTAG_LEGAL_END
1025	return 0;
1026	}
1027
1028
1029	/* char_langle_tag -- '<' when tags or autolinks are allowed */
1030	static size_t char_langle_tag(
1031

Fossil SCM

Keyboard Shortcuts