Fossil SCM

reformatted invalid_utf8 to make it conform a bit better to existing style

sdr 2016-06-11 00:41 invalid_utf8_table

Commit dd3bb22cd726f26e785776d50f605a469a03870d

Parent 4f906e5357b33e6…

1 file changed +9 -19

M src/lookslike.c

+9 -19

		--- src/lookslike.c
		+++ src/lookslike.c
		@@ -143,12 +143,11 @@
143	143	** Java and Tcl use it. This function also considers valid
144	144	** the derivatives CESU-8 & WTF-8 (as described in the same
145	145	** wikipedia article referenced previously).
146	146	*/
147	147
148		-int invalid_utf8(const Blob *pContent)
149		-{
	148	+int invalid_utf8(const Blob *pContent) {
150	149	/* definitions for various utf-8 sequence lengths */
151	150	static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
152	151	static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
153	152	static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
154	153	static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
		@@ -164,18 +163,16 @@
164	163
165	164	/* a pointer to the table; NULL means not yet setup */
166	165	static unsigned char** lb_ptr = NULL;
167	166
168	167	/* if the table pointer hasn't been initialized */
169		- if (lb_ptr == NULL)
170		- {
	168	+ if (lb_ptr == NULL) {
171	169	lb_ptr = lb_tab;
172	170
173	171	/* for each definition, set the lead byte table pointer to the proper definition */
174	172	unsigned char** pp = def_arr;
175		- while (*pp != NULL)
176		- {
	173	+ while (*pp != NULL) {
177	174	unsigned char lo = pp[0][1];
178	175	unsigned char hi = pp[0][2];
179	176	unsigned char i;
180	177	for (i = lo; i <= hi; ++i)
181	178	lb_ptr[i] = pp[0];
		@@ -186,45 +183,38 @@
186	183	/* buffer pointer and size */
187	184	const unsigned char* z = (unsigned char *)blob_buffer(pContent);
188	185	unsigned int n = blob_size(pContent);
189	186
190	187	/* while we haven't checked all the bytes in the buffer */
191		- while (n > 0)
192		- {
	188	+ while (n > 0) {
	189	+
193	190	/* ascii is trivial */
194		- if (*z < 0x80)
195		- {
	191	+ if (*z < 0x80) {
196	192	++z;
197	193	--n;
198		- }
199		- else
200		- {
	194	+ } else {
201	195	/* get the definition for this lead byte */
202	196	unsigned char* def = lb_ptr[*z++];
203	197	unsigned char i, len;
204	198
205	199	/* if the definition doesn't exist, return invalid */
206		- if (!def)
207		- return LOOK_INVALID;
	200	+ if (!def) return LOOK_INVALID;
208	201
209	202	/* get the expected sequence length */
210	203	len = *def;
211	204
212	205	/* if there aren't enough bytes left, return invalid */
213		- if (n < len)
214		- return LOOK_INVALID;
	206	+ if (n < len) return LOOK_INVALID;
215	207
216	208	/* skip the length & lead byte range */
217	209	def += 3;
218	210
219	211	/* we already know byte #0 is good, so check the remaining bytes */
220	212	for (i = 1; i < len; ++i)
221		- {
222	213	/* if the byte is outside the allowed range for this definition, return invalid */
223	214	if ((*z < *def++) || (*z++ > *def++))
224	215	return LOOK_INVALID;
225		- }
226	216
227	217	/* advance to the next sequence */
228	218	n -= len;
229	219	}
230	220	}
231	221

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -143,12 +143,11 @@
143	** Java and Tcl use it. This function also considers valid
144	** the derivatives CESU-8 & WTF-8 (as described in the same
145	** wikipedia article referenced previously).
146	*/
147
148	int invalid_utf8(const Blob *pContent)
149	{
150	/* definitions for various utf-8 sequence lengths */
151	static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
152	static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
153	static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
154	static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
	@@ -164,18 +163,16 @@
164
165	/* a pointer to the table; NULL means not yet setup */
166	static unsigned char** lb_ptr = NULL;
167
168	/* if the table pointer hasn't been initialized */
169	if (lb_ptr == NULL)
170	{
171	lb_ptr = lb_tab;
172
173	/* for each definition, set the lead byte table pointer to the proper definition */
174	unsigned char** pp = def_arr;
175	while (*pp != NULL)
176	{
177	unsigned char lo = pp[0][1];
178	unsigned char hi = pp[0][2];
179	unsigned char i;
180	for (i = lo; i <= hi; ++i)
181	lb_ptr[i] = pp[0];
	@@ -186,45 +183,38 @@
186	/* buffer pointer and size */
187	const unsigned char* z = (unsigned char *)blob_buffer(pContent);
188	unsigned int n = blob_size(pContent);
189
190	/* while we haven't checked all the bytes in the buffer */
191	while (n > 0)
192	{
193	/* ascii is trivial */
194	if (*z < 0x80)
195	{
196	++z;
197	--n;
198	}
199	else
200	{
201	/* get the definition for this lead byte */
202	unsigned char* def = lb_ptr[*z++];
203	unsigned char i, len;
204
205	/* if the definition doesn't exist, return invalid */
206	if (!def)
207	return LOOK_INVALID;
208
209	/* get the expected sequence length */
210	len = *def;
211
212	/* if there aren't enough bytes left, return invalid */
213	if (n < len)
214	return LOOK_INVALID;
215
216	/* skip the length & lead byte range */
217	def += 3;
218
219	/* we already know byte #0 is good, so check the remaining bytes */
220	for (i = 1; i < len; ++i)
221	{
222	/* if the byte is outside the allowed range for this definition, return invalid */
223	if ((*z < *def++) || (*z++ > *def++))
224	return LOOK_INVALID;
225	}
226
227	/* advance to the next sequence */
228	n -= len;
229	}
230	}
231

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -143,12 +143,11 @@
143	** Java and Tcl use it. This function also considers valid
144	** the derivatives CESU-8 & WTF-8 (as described in the same
145	** wikipedia article referenced previously).
146	*/
147
148	int invalid_utf8(const Blob *pContent) {

149	/* definitions for various utf-8 sequence lengths */
150	static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
151	static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
152	static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
153	static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
	@@ -164,18 +163,16 @@
163
164	/* a pointer to the table; NULL means not yet setup */
165	static unsigned char** lb_ptr = NULL;
166
167	/* if the table pointer hasn't been initialized */
168	if (lb_ptr == NULL) {

169	lb_ptr = lb_tab;
170
171	/* for each definition, set the lead byte table pointer to the proper definition */
172	unsigned char** pp = def_arr;
173	while (*pp != NULL) {

174	unsigned char lo = pp[0][1];
175	unsigned char hi = pp[0][2];
176	unsigned char i;
177	for (i = lo; i <= hi; ++i)
178	lb_ptr[i] = pp[0];
	@@ -186,45 +183,38 @@
183	/* buffer pointer and size */
184	const unsigned char* z = (unsigned char *)blob_buffer(pContent);
185	unsigned int n = blob_size(pContent);
186
187	/* while we haven't checked all the bytes in the buffer */
188	while (n > 0) {
189
190	/* ascii is trivial */
191	if (*z < 0x80) {

192	++z;
193	--n;
194	} else {


195	/* get the definition for this lead byte */
196	unsigned char* def = lb_ptr[*z++];
197	unsigned char i, len;
198
199	/* if the definition doesn't exist, return invalid */
200	if (!def) return LOOK_INVALID;

201
202	/* get the expected sequence length */
203	len = *def;
204
205	/* if there aren't enough bytes left, return invalid */
206	if (n < len) return LOOK_INVALID;

207
208	/* skip the length & lead byte range */
209	def += 3;
210
211	/* we already know byte #0 is good, so check the remaining bytes */
212	for (i = 1; i < len; ++i)

213	/* if the byte is outside the allowed range for this definition, return invalid */
214	if ((*z < *def++) || (*z++ > *def++))
215	return LOOK_INVALID;

216
217	/* advance to the next sequence */
218	n -= len;
219	}
220	}
221

Fossil SCM

Keyboard Shortcuts