[disorder] / lib / words.c

/*
 * This file is part of DisOrder
 * Copyright (C) 2004, 2007 Richard Kettlewell
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */

#include <config.h>
#include "types.h"

#include <string.h>
#include <stddef.h>

#include "mem.h"
#include "vector.h"
#include "table.h"
#include "words.h"
#include "utf8.h"
#include "log.h"
#include "charset.h"

#include "unidata.h"

const char *casefold(const char *ptr) {
  struct dynstr d;
  uint32_t c;
  const char *s = ptr;

  dynstr_init(&d);
  while(*s) {
    /* Convert UTF-8 to UCS-32 */
    PARSE_UTF8(s, c, return ptr);
    /* Normalize */
    if(c < UNICODE_NCHARS) {
      /* If this a known character, convert it to lower case */
      const struct unidata *const ud = &unidata[c / 256][c % 256];
      c += ud->lower_offset;
    }
    /* Convert UCS-4 back to UTF-8 */
    one_ucs42utf8(c, &d);
  }
  dynstr_terminate(&d);
  return d.vec;
}

/* Look up the general category recorded for code point C in the unidata
 * table.  Code points at or beyond UNICODE_NCHARS have no table entry and
 * are reported as unicode_gc_Cn.
 */
static enum unicode_gc_cat cat(uint32_t c) {
  if(c >= UNICODE_NCHARS)
    return unicode_gc_Cn;
  /* The table is indexed by the high part of the code point, then the low
   * 8 bits */
  return unidata[c / 256][c % 256].gc;
}

/* XXX this is a bit kludgy */

char **words(const char *s, int *nvecp) {
  struct vector v;
  struct dynstr d;
  const char *start;
  uint32_t c;
  int in_word = 0;

  vector_init(&v);
  while(*s) {
    start = s;
    PARSE_UTF8(s, c, return 0);
    /* special cases first */
    switch(c) {
    case '/':
    case '.':
    case '+':
    case '&':
    case ':':
    case '_':
    case '-':
      goto separator;
    }
    /* do the rest on category */
    switch(cat(c)) {
    case unicode_gc_Ll:
    case unicode_gc_Lm:
    case unicode_gc_Lo:
    case unicode_gc_Lt:
    case unicode_gc_Lu:
    case unicode_gc_Nd:
    case unicode_gc_Nl:
    case unicode_gc_No:
    case unicode_gc_Sc:
    case unicode_gc_Sk:
    case unicode_gc_Sm:
    case unicode_gc_So:
      /* letters, digits and symbols are considered to be part of
       * words */
      if(!in_word) {
	dynstr_init(&d);
	in_word = 1;
      }
      dynstr_append_bytes(&d, start, s - start);
      break;

    case unicode_gc_Cc:
    case unicode_gc_Cf:
    case unicode_gc_Co:
    case unicode_gc_Cs:
    case unicode_gc_Zl:
    case unicode_gc_Zp:
    case unicode_gc_Zs:
    case unicode_gc_Pe:
    case unicode_gc_Ps:
    separator:
      if(in_word) {
	dynstr_terminate(&d);
	vector_append(&v, d.vec);
	in_word = 0;
      }
      break;

    case unicode_gc_Mc:
    case unicode_gc_Me:
    case unicode_gc_Mn:
    case unicode_gc_Pc:
    case unicode_gc_Pd:
    case unicode_gc_Pf:
    case unicode_gc_Pi:
    case unicode_gc_Po:
    case unicode_gc_Cn:
      /* control and punctuation is completely ignored */
      break;

    }
  }
  if(in_word) {
    /* pick up the final word */
    dynstr_terminate(&d);
    vector_append(&v, d.vec);
  }
  vector_terminate(&v);
  if(nvecp)
    *nvecp = v.nvec;
  return v.vec;
}

/*
Local Variables:
c-basic-offset:2
comment-column:40
End:
*/
Commit	Line	Data
460b9539	1	/*
460b9539	2	* This file is part of DisOrder
61507e3c	3	* Copyright (C) 2004, 2007 Richard Kettlewell
460b9539	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
	20
	21	#include <config.h>
	22	#include "types.h"
	23
	24	#include <string.h>
	25	#include <stddef.h>
	26
	27	#include "mem.h"
	28	#include "vector.h"
	29	#include "table.h"
	30	#include "words.h"
	31	#include "utf8.h"
61507e3c RK	32	#include "log.h"
61507e3c RK	33	#include "charset.h"
460b9539	34
61507e3c	35	#include "unidata.h"
460b9539	36
	37	const char casefold(const char ptr) {
	38	struct dynstr d;
460b9539	39	uint32_t c;
61507e3c	40	const char *s = ptr;
460b9539	41
	42	dynstr_init(&d);
	43	while(*s) {
61507e3c	44	/* Convert UTF-8 to UCS-32 */
460b9539	45	PARSE_UTF8(s, c, return ptr);
61507e3c RK	46	/* Normalize */
	47	if(c < UNICODE_NCHARS) {
	48	/* If this a known character, convert it to lower case */
	49	const struct unidata *const ud = &unidata[c / 256][c % 256];
	50	c += ud->lower_offset;
	51	}
	52	/* Convert UCS-4 back to UTF-8 */
	53	one_ucs42utf8(c, &d);
460b9539	54	}
	55	dynstr_terminate(&d);
	56	return d.vec;
	57	}
	58
	59	static enum unicode_gc_cat cat(uint32_t c) {
61507e3c RK	60	if(c < UNICODE_NCHARS) {
	61	/* If this a known character, convert it to lower case */
	62	const struct unidata *const ud = &unidata[c / 256][c % 256];
	63	return ud->gc;
	64	} else
	65	return unicode_gc_Cn;
460b9539	66	}
	67
	68	/* XXX this is a bit kludgy */
	69
	70	char *words(const char s, int *nvecp) {
	71	struct vector v;
	72	struct dynstr d;
	73	const char *start;
	74	uint32_t c;
	75	int in_word = 0;
	76
	77	vector_init(&v);
	78	while(*s) {
	79	start = s;
	80	PARSE_UTF8(s, c, return 0);
	81	/* special cases first */
	82	switch(c) {
	83	case '/':
	84	case '.':
	85	case '+':
	86	case '&':
	87	case ':':
	88	case '_':
	89	case '-':
	90	goto separator;
	91	}
	92	/* do the rest on category */
	93	switch(cat(c)) {
	94	case unicode_gc_Ll:
	95	case unicode_gc_Lm:
	96	case unicode_gc_Lo:
	97	case unicode_gc_Lt:
	98	case unicode_gc_Lu:
	99	case unicode_gc_Nd:
	100	case unicode_gc_Nl:
	101	case unicode_gc_No:
	102	case unicode_gc_Sc:
	103	case unicode_gc_Sk:
	104	case unicode_gc_Sm:
	105	case unicode_gc_So:
	106	/* letters, digits and symbols are considered to be part of
	107	* words */
	108	if(!in_word) {
	109	dynstr_init(&d);
	110	in_word = 1;
	111	}
	112	dynstr_append_bytes(&d, start, s - start);
	113	break;
	114
	115	case unicode_gc_Cc:
	116	case unicode_gc_Cf:
	117	case unicode_gc_Co:
	118	case unicode_gc_Cs:
	119	case unicode_gc_Zl:
	120	case unicode_gc_Zp:
	121	case unicode_gc_Zs:
	122	case unicode_gc_Pe:
	123	case unicode_gc_Ps:
	124	separator:
	125	if(in_word) {
	126	dynstr_terminate(&d);
	127	vector_append(&v, d.vec);
	128	in_word = 0;
	129	}
130	break;
131
132	case unicode_gc_Mc:
133	case unicode_gc_Me:
134	case unicode_gc_Mn:
135	case unicode_gc_Pc:
136	case unicode_gc_Pd:
137	case unicode_gc_Pf:
138	case unicode_gc_Pi:
139	case unicode_gc_Po:
61507e3c	140	case unicode_gc_Cn:
460b9539	141	/* control and punctuation is completely ignored */
	142	break;
	143
	144	}
	145	}
	146	if(in_word) {
	147	/* pick up the final word */
	148	dynstr_terminate(&d);
	149	vector_append(&v, d.vec);
	150	}
	151	vector_terminate(&v);
	152	if(nvecp)
	153	*nvecp = v.nvec;
	154	return v.vec;
	155	}
	156
	157	/*
	158	Local Variables:
	159	c-basic-offset:2
	160	comment-column:40
	161	End:
	162	*/