chiark - git - mdw - disorder/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* This file is part of DisOrder
	3	* Copyright (C) 2004 Richard Kettlewell
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
	20
	21	#include <config.h>
	22	#include "types.h"
	23
	24	#include <string.h>
	25	#include <stddef.h>
	26
	27	#include "mem.h"
	28	#include "vector.h"
	29	#include "table.h"
	30	#include "words.h"
	31	#include "utf8.h"
	32
	33	#include "casefold.h"
	34	#include "unicodegc.h"
	35
	36	const char casefold(const char ptr) {
	37	struct dynstr d;
	38	int l, r, m;
	39	uint32_t c;
	40	const struct cm *t;
	41	const char start, s = ptr;
	42
	43	dynstr_init(&d);
	44	while(*s) {
	45	start = s;
	46	PARSE_UTF8(s, c, return ptr);
	47	/* seek the folded equivalent */
	48	t = cm[c & CM_MASK];
	49	l = 0;
	50	r = cmn[c & CM_MASK] - 1;
	51	while(l <= r && c != t[m = (l + r) / 2].ch)
	52	if(c < t[m].ch)
	53	r = m - 1;
	54	else
	55	l = m + 1;
	56	if(l <= r)
	57	dynstr_append_string(&d, t[m].tr);
	58	else
	59	dynstr_append_bytes(&d, start, s - start);
	60	}
	61	dynstr_terminate(&d);
	62	return d.vec;
	63	}
	64
	65	static enum unicode_gc_cat cat(uint32_t c) {
	66	int l, r, m;
	67
	68	l = 0;
	69	r = sizeof gcs / sizeof *gcs;
	70	while(l <= r) {
	71	m = (l + r) / 2;
	72	if(c < gcs[m].l)
	73	r = m - 1;
	74	else if(c > gcs[m].h)
	75	l = m + 1;
	76	else
	77	return gcs[m].cat;
	78	}
	79	return unicode_gc_none;
	80	}
	81
	82	/* XXX this is a bit kludgy */
	83
	84	char *words(const char s, int *nvecp) {
	85	struct vector v;
	86	struct dynstr d;
	87	const char *start;
	88	uint32_t c;
	89	int in_word = 0;
	90
	91	vector_init(&v);
	92	while(*s) {
	93	start = s;
	94	PARSE_UTF8(s, c, return 0);
	95	/* special cases first */
	96	switch(c) {
	97	case '/':
	98	case '.':
	99	case '+':
	100	case '&':
	101	case ':':
	102	case '_':
	103	case '-':
	104	goto separator;
	105	}
	106	/* do the rest on category */
	107	switch(cat(c)) {
	108	case unicode_gc_Ll:
	109	case unicode_gc_Lm:
	110	case unicode_gc_Lo:
	111	case unicode_gc_Lt:
	112	case unicode_gc_Lu:
	113	case unicode_gc_Nd:
	114	case unicode_gc_Nl:
	115	case unicode_gc_No:
	116	case unicode_gc_Sc:
	117	case unicode_gc_Sk:
	118	case unicode_gc_Sm:
	119	case unicode_gc_So:
	120	/* letters, digits and symbols are considered to be part of
	121	* words */
	122	if(!in_word) {
	123	dynstr_init(&d);
	124	in_word = 1;
	125	}
	126	dynstr_append_bytes(&d, start, s - start);
	127	break;
	128
	129	case unicode_gc_Cc:
	130	case unicode_gc_Cf:
	131	case unicode_gc_Co:
	132	case unicode_gc_Cs:
	133	case unicode_gc_Zl:
	134	case unicode_gc_Zp:
	135	case unicode_gc_Zs:
	136	case unicode_gc_Pe:
	137	case unicode_gc_Ps:
	138	separator:
	139	if(in_word) {
	140	dynstr_terminate(&d);
	141	vector_append(&v, d.vec);
	142	in_word = 0;
	143	}
	144	break;
	145
	146	case unicode_gc_Mc:
	147	case unicode_gc_Me:
	148	case unicode_gc_Mn:
	149	case unicode_gc_Pc:
	150	case unicode_gc_Pd:
	151	case unicode_gc_Pf:
	152	case unicode_gc_Pi:
	153	case unicode_gc_Po:
	154	case unicode_gc_none:
	155	/* control and punctuation is completely ignored */
	156	break;
	157
	158	}
	159	}
	160	if(in_word) {
	161	/* pick up the final word */
	162	dynstr_terminate(&d);
	163	vector_append(&v, d.vec);
	164	}
	165	vector_terminate(&v);
	166	if(nvecp)
	167	*nvecp = v.nvec;
	168	return v.vec;
	169	}
	170
	171	/*
	172	Local Variables:
	173	c-basic-offset:2
	174	comment-column:40
	175	End:
	176	*/