chiark - git - mdw - disorder/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* This file is part of DisOrder
	3	* Copyright (C) 2004, 2007 Richard Kettlewell
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
	20
	21	#include <config.h>
	22	#include "types.h"
	23
	24	#include <string.h>
	25	#include <stddef.h>
	26
	27	#include "mem.h"
	28	#include "vector.h"
	29	#include "table.h"
	30	#include "words.h"
	31	#include "utf8.h"
	32	#include "log.h"
	33	#include "charset.h"
	34
	35	#include "unidata.h"
	36
	37	const char casefold(const char ptr) {
	38	struct dynstr d;
	39	uint32_t c;
	40	const char *s = ptr;
	41
	42	dynstr_init(&d);
	43	while(*s) {
	44	/* Convert UTF-8 to UCS-32 */
	45	PARSE_UTF8(s, c, return ptr);
	46	/* Normalize */
	47	if(c < UNICODE_NCHARS) {
	48	/* If this a known character, convert it to lower case */
	49	const struct unidata *const ud = &unidata[c / 256][c % 256];
	50	c += ud->lower_offset;
	51	}
	52	/* Convert UCS-4 back to UTF-8 */
	53	one_ucs42utf8(c, &d);
	54	}
	55	dynstr_terminate(&d);
	56	return d.vec;
	57	}
	58
	59	static enum unicode_gc_cat cat(uint32_t c) {
	60	if(c < UNICODE_NCHARS) {
	61	/* If this a known character, convert it to lower case */
	62	const struct unidata *const ud = &unidata[c / 256][c % 256];
	63	return ud->gc;
	64	} else
	65	return unicode_gc_Cn;
	66	}
	67
	68	/* XXX this is a bit kludgy */
	69
	70	char *words(const char s, int *nvecp) {
	71	struct vector v;
	72	struct dynstr d;
	73	const char *start;
	74	uint32_t c;
	75	int in_word = 0;
	76
	77	vector_init(&v);
	78	while(*s) {
	79	start = s;
	80	PARSE_UTF8(s, c, return 0);
	81	/* special cases first */
	82	switch(c) {
	83	case '/':
	84	case '.':
	85	case '+':
	86	case '&':
	87	case ':':
	88	case '_':
	89	case '-':
	90	goto separator;
	91	}
	92	/* do the rest on category */
	93	switch(cat(c)) {
	94	case unicode_gc_Ll:
	95	case unicode_gc_Lm:
	96	case unicode_gc_Lo:
	97	case unicode_gc_Lt:
	98	case unicode_gc_Lu:
	99	case unicode_gc_Nd:
	100	case unicode_gc_Nl:
	101	case unicode_gc_No:
	102	case unicode_gc_Sc:
	103	case unicode_gc_Sk:
	104	case unicode_gc_Sm:
	105	case unicode_gc_So:
	106	/* letters, digits and symbols are considered to be part of
	107	* words */
	108	if(!in_word) {
	109	dynstr_init(&d);
	110	in_word = 1;
	111	}
	112	dynstr_append_bytes(&d, start, s - start);
	113	break;
	114
	115	case unicode_gc_Cc:
	116	case unicode_gc_Cf:
	117	case unicode_gc_Co:
	118	case unicode_gc_Cs:
	119	case unicode_gc_Zl:
	120	case unicode_gc_Zp:
	121	case unicode_gc_Zs:
	122	case unicode_gc_Pe:
	123	case unicode_gc_Ps:
	124	separator:
	125	if(in_word) {
	126	dynstr_terminate(&d);
	127	vector_append(&v, d.vec);
	128	in_word = 0;
	129	}
	130	break;
	131
	132	case unicode_gc_Mc:
	133	case unicode_gc_Me:
	134	case unicode_gc_Mn:
	135	case unicode_gc_Pc:
	136	case unicode_gc_Pd:
	137	case unicode_gc_Pf:
	138	case unicode_gc_Pi:
	139	case unicode_gc_Po:
	140	case unicode_gc_Cn:
	141	/* control and punctuation is completely ignored */
	142	break;
	143
	144	}
	145	}
	146	if(in_word) {
	147	/* pick up the final word */
	148	dynstr_terminate(&d);
	149	vector_append(&v, d.vec);
	150	}
	151	vector_terminate(&v);
	152	if(nvecp)
	153	*nvecp = v.nvec;
	154	return v.vec;
	155	}
	156
	157	/*
	158	Local Variables:
	159	c-basic-offset:2
	160	comment-column:40
	161	End:
	162	*/