[disorder] / lib / words.c

/*
 * This file is part of DisOrder
 * Copyright (C) 2004 Richard Kettlewell
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */

#include <config.h>
#include "types.h"

#include <string.h>
#include <stddef.h>

#include "mem.h"
#include "vector.h"
#include "table.h"
#include "words.h"
#include "utf8.h"

#include "casefold.h"
#include "unicodegc.h"

const char *casefold(const char *ptr) {
  struct dynstr d;
  int l, r, m;
  uint32_t c;
  const struct cm *t;
  const char *start, *s = ptr;

  dynstr_init(&d);
  while(*s) {
    start = s;
    PARSE_UTF8(s, c, return ptr);
    /* seek the folded equivalent */
    t = cm[c & CM_MASK];
    l = 0;
    r = cmn[c & CM_MASK] - 1;
    while(l <= r && c != t[m = (l + r) / 2].ch)
      if(c < t[m].ch)
	r = m - 1;
      else
	l = m + 1;
    if(l <= r)
      dynstr_append_string(&d, t[m].tr);
    else
      dynstr_append_bytes(&d, start, s - start);
  }
  dynstr_terminate(&d);
  return d.vec;
}

/* Look up the general category of code point C.
 *
 * Binary-searches the gcs table for the entry whose inclusive range
 * [l, h] contains C and returns that entry's cat field.  This presumes
 * gcs is sorted ascending with non-overlapping ranges.  Returns
 * unicode_gc_none if no entry covers C.
 */
static enum unicode_gc_cat cat(uint32_t c) {
  int l, r, m;

  l = 0;
  /* The loop bounds are inclusive, so r must start at the index of the
   * last element, not at the element count; otherwise a code point above
   * every range would read one entry past the end of gcs. */
  r = (int)(sizeof gcs / sizeof *gcs) - 1;
  while(l <= r) {
    m = l + (r - l) / 2;
    if(c < gcs[m].l)
      r = m - 1;
    else if(c > gcs[m].h)
      l = m + 1;
    else
      return gcs[m].cat;
  }
  return unicode_gc_none;
}

/* XXX this is a bit kludgy */

char **words(const char *s, int *nvecp) {
  struct vector v;
  struct dynstr d;
  const char *start;
  uint32_t c;
  int in_word = 0;

  vector_init(&v);
  while(*s) {
    start = s;
    PARSE_UTF8(s, c, return 0);
    /* special cases first */
    switch(c) {
    case '/':
    case '.':
    case '+':
    case '&':
    case ':':
    case '_':
    case '-':
      goto separator;
    }
    /* do the rest on category */
    switch(cat(c)) {
    case unicode_gc_Ll:
    case unicode_gc_Lm:
    case unicode_gc_Lo:
    case unicode_gc_Lt:
    case unicode_gc_Lu:
    case unicode_gc_Nd:
    case unicode_gc_Nl:
    case unicode_gc_No:
    case unicode_gc_Sc:
    case unicode_gc_Sk:
    case unicode_gc_Sm:
    case unicode_gc_So:
      /* letters, digits and symbols are considered to be part of
       * words */
      if(!in_word) {
	dynstr_init(&d);
	in_word = 1;
      }
      dynstr_append_bytes(&d, start, s - start);
      break;

    case unicode_gc_Cc:
    case unicode_gc_Cf:
    case unicode_gc_Co:
    case unicode_gc_Cs:
    case unicode_gc_Zl:
    case unicode_gc_Zp:
    case unicode_gc_Zs:
    case unicode_gc_Pe:
    case unicode_gc_Ps:
    separator:
      if(in_word) {
	dynstr_terminate(&d);
	vector_append(&v, d.vec);
	in_word = 0;
      }
      break;

    case unicode_gc_Mc:
    case unicode_gc_Me:
    case unicode_gc_Mn:
    case unicode_gc_Pc:
    case unicode_gc_Pd:
    case unicode_gc_Pf:
    case unicode_gc_Pi:
    case unicode_gc_Po:
    case unicode_gc_none:
      /* control and punctuation is completely ignored */
      break;

    }
  }
  if(in_word) {
    /* pick up the final word */
    dynstr_terminate(&d);
    vector_append(&v, d.vec);
  }
  vector_terminate(&v);
  if(nvecp)
    *nvecp = v.nvec;
  return v.vec;
}

/*
Local Variables:
c-basic-offset:2
comment-column:40
End:
*/
Commit	Line	Data
460b9539	1	/*
	2	* This file is part of DisOrder
	3	* Copyright (C) 2004 Richard Kettlewell
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
	20
	21	#include <config.h>
	22	#include "types.h"
	23
	24	#include <string.h>
	25	#include <stddef.h>
	26
	27	#include "mem.h"
	28	#include "vector.h"
	29	#include "table.h"
	30	#include "words.h"
	31	#include "utf8.h"
	32
	33	#include "casefold.h"
	34	#include "unicodegc.h"
	35
	36	const char casefold(const char ptr) {
	37	struct dynstr d;
	38	int l, r, m;
	39	uint32_t c;
	40	const struct cm *t;
	41	const char start, s = ptr;
	42
	43	dynstr_init(&d);
	44	while(*s) {
	45	start = s;
	46	PARSE_UTF8(s, c, return ptr);
	47	/* seek the folded equivalent */
	48	t = cm[c & CM_MASK];
	49	l = 0;
	50	r = cmn[c & CM_MASK] - 1;
	51	while(l <= r && c != t[m = (l + r) / 2].ch)
	52	if(c < t[m].ch)
	53	r = m - 1;
	54	else
	55	l = m + 1;
	56	if(l <= r)
	57	dynstr_append_string(&d, t[m].tr);
	58	else
	59	dynstr_append_bytes(&d, start, s - start);
	60	}
	61	dynstr_terminate(&d);
	62	return d.vec;
	63	}
	64
65	static enum unicode_gc_cat cat(uint32_t c) {
66	int l, r, m;
67
68	l = 0;
69	r = sizeof gcs / sizeof *gcs;
70	while(l <= r) {
71	m = (l + r) / 2;
72	if(c < gcs[m].l)
73	r = m - 1;
74	else if(c > gcs[m].h)
75	l = m + 1;
76	else
77	return gcs[m].cat;
78	}
79	return unicode_gc_none;
80	}
81
82	/* XXX this is a bit kludgy */
83
84	char *words(const char s, int *nvecp) {
85	struct vector v;
86	struct dynstr d;
87	const char *start;
88	uint32_t c;
89	int in_word = 0;
90
91	vector_init(&v);
92	while(*s) {
93	start = s;
94	PARSE_UTF8(s, c, return 0);
95	/* special cases first */
96	switch(c) {
97	case '/':
98	case '.':
99	case '+':
100	case '&':
101	case ':':
102	case '_':
103	case '-':
104	goto separator;
105	}
106	/* do the rest on category */
107	switch(cat(c)) {
108	case unicode_gc_Ll:
109	case unicode_gc_Lm:
110	case unicode_gc_Lo:
111	case unicode_gc_Lt:
112	case unicode_gc_Lu:
113	case unicode_gc_Nd:
114	case unicode_gc_Nl:
115	case unicode_gc_No:
116	case unicode_gc_Sc:
117	case unicode_gc_Sk:
118	case unicode_gc_Sm:
119	case unicode_gc_So:
120	/* letters, digits and symbols are considered to be part of
121	* words */
122	if(!in_word) {
123	dynstr_init(&d);
124	in_word = 1;
125	}
126	dynstr_append_bytes(&d, start, s - start);
127	break;
128
129	case unicode_gc_Cc:
130	case unicode_gc_Cf:
131	case unicode_gc_Co:
132	case unicode_gc_Cs:
133	case unicode_gc_Zl:
134	case unicode_gc_Zp:
135	case unicode_gc_Zs:
136	case unicode_gc_Pe:
137	case unicode_gc_Ps:
138	separator:
139	if(in_word) {
140	dynstr_terminate(&d);
141	vector_append(&v, d.vec);
142	in_word = 0;
143	}
144	break;
145
146	case unicode_gc_Mc:
147	case unicode_gc_Me:
148	case unicode_gc_Mn:
149	case unicode_gc_Pc:
150	case unicode_gc_Pd:
151	case unicode_gc_Pf:
152	case unicode_gc_Pi:
153	case unicode_gc_Po:
154	case unicode_gc_none:
155	/* control and punctuation is completely ignored */
156	break;
157
158	}
159	}
160	if(in_word) {
161	/* pick up the final word */
162	dynstr_terminate(&d);
163	vector_append(&v, d.vec);
164	}
165	vector_terminate(&v);
166	if(nvecp)
167	*nvecp = v.nvec;
168	return v.vec;
169	}
170
171	/*
172	Local Variables:
173	c-basic-offset:2
174	comment-column:40
175	End:
176	*/