[disorder] / lib / words.c

/*
 * This file is part of DisOrder
 * Copyright (C) 2004, 2007 Richard Kettlewell
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */

#include <config.h>
#include "types.h"

#include <string.h>
#include <stddef.h>

#include "mem.h"
#include "vector.h"
#include "table.h"
#include "words.h"
#include "utf8.h"
#include "log.h"
#include "charset.h"

#include "unidata.h"
#include "unicode.h"

/* Return the case-folded form of the NUL-terminated string PTR.
 *
 * The work is delegated to utf8_casefold_canon(), which is handed the
 * string together with its length in bytes.  The final argument is
 * passed as 0 (NOTE(review): presumably an optional output parameter
 * that is not wanted here - confirm against unicode.h).
 */
const char *casefold(const char *ptr) {
  const size_t nbytes = strlen(ptr);

  return utf8_casefold_canon(ptr, nbytes, 0);
}

/* Look up the general category of code point C.
 *
 * The unidata[] table is two-level: the first index is
 * C / UNICODE_MODULUS and the second is C % UNICODE_MODULUS.  Anything
 * at or beyond UNICODE_NCHARS has no table entry and is reported as
 * unicode_gc_Cn.
 */
static enum unicode_gc_cat cat(uint32_t c) {
  /* Out of table range: no entry to consult */
  if(c >= UNICODE_NCHARS)
    return unicode_gc_Cn;
  return unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS].gc;
}

/* XXX this is a bit kludgy */

char **words(const char *s, int *nvecp) {
  struct vector v;
  struct dynstr d;
  const char *start;
  uint32_t c;
  int in_word = 0;

  vector_init(&v);
  while(*s) {
    start = s;
    PARSE_UTF8(s, c, return 0);
    /* special cases first */
    switch(c) {
    case '/':
    case '.':
    case '+':
    case '&':
    case ':':
    case '_':
    case '-':
      goto separator;
    }
    /* do the rest on category */
    switch(cat(c)) {
    case unicode_gc_Ll:
    case unicode_gc_Lm:
    case unicode_gc_Lo:
    case unicode_gc_Lt:
    case unicode_gc_Lu:
    case unicode_gc_Nd:
    case unicode_gc_Nl:
    case unicode_gc_No:
    case unicode_gc_Sc:
    case unicode_gc_Sk:
    case unicode_gc_Sm:
    case unicode_gc_So:
      /* letters, digits and symbols are considered to be part of
       * words */
      if(!in_word) {
	dynstr_init(&d);
	in_word = 1;
      }
      dynstr_append_bytes(&d, start, s - start);
      break;

    case unicode_gc_Cc:
    case unicode_gc_Cf:
    case unicode_gc_Co:
    case unicode_gc_Cs:
    case unicode_gc_Zl:
    case unicode_gc_Zp:
    case unicode_gc_Zs:
    case unicode_gc_Pe:
    case unicode_gc_Ps:
    separator:
      if(in_word) {
	dynstr_terminate(&d);
	vector_append(&v, d.vec);
	in_word = 0;
      }
      break;

    case unicode_gc_Mc:
    case unicode_gc_Me:
    case unicode_gc_Mn:
    case unicode_gc_Pc:
    case unicode_gc_Pd:
    case unicode_gc_Pf:
    case unicode_gc_Pi:
    case unicode_gc_Po:
    case unicode_gc_Cn:
      /* control and punctuation is completely ignored */
      break;

    }
  }
  if(in_word) {
    /* pick up the final word */
    dynstr_terminate(&d);
    vector_append(&v, d.vec);
  }
  vector_terminate(&v);
  if(nvecp)
    *nvecp = v.nvec;
  return v.vec;
}

/*
Local Variables:
c-basic-offset:2
comment-column:40
End:
*/
Commit	Line	Data
460b9539	1	/*
460b9539	2	* This file is part of DisOrder
61507e3c	3	* Copyright (C) 2004, 2007 Richard Kettlewell
460b9539	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
	20
	21	#include <config.h>
	22	#include "types.h"
	23
	24	#include <string.h>
	25	#include <stddef.h>
	26
	27	#include "mem.h"
	28	#include "vector.h"
	29	#include "table.h"
	30	#include "words.h"
	31	#include "utf8.h"
61507e3c RK	32	#include "log.h"
61507e3c RK	33	#include "charset.h"
460b9539	34
61507e3c	35	#include "unidata.h"
e5a5a138	36	#include "unicode.h"
460b9539	37
460b9539	38	const char casefold(const char ptr) {
e5a5a138	39	return utf8_casefold_canon(ptr, strlen(ptr), 0);
460b9539	40	}
	41
	42	static enum unicode_gc_cat cat(uint32_t c) {
61507e3c RK	43	if(c < UNICODE_NCHARS) {
61507e3c RK	44	/* If this a known character, convert it to lower case */
e5a5a138	45	const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
61507e3c RK	46	return ud->gc;
	47	} else
	48	return unicode_gc_Cn;
460b9539	49	}
	50
	51	/* XXX this is a bit kludgy */
	52
	53	char *words(const char s, int *nvecp) {
	54	struct vector v;
	55	struct dynstr d;
	56	const char *start;
	57	uint32_t c;
	58	int in_word = 0;
	59
	60	vector_init(&v);
	61	while(*s) {
	62	start = s;
	63	PARSE_UTF8(s, c, return 0);
	64	/* special cases first */
	65	switch(c) {
	66	case '/':
	67	case '.':
	68	case '+':
	69	case '&':
	70	case ':':
	71	case '_':
	72	case '-':
	73	goto separator;
	74	}
	75	/* do the rest on category */
	76	switch(cat(c)) {
	77	case unicode_gc_Ll:
	78	case unicode_gc_Lm:
	79	case unicode_gc_Lo:
	80	case unicode_gc_Lt:
	81	case unicode_gc_Lu:
	82	case unicode_gc_Nd:
	83	case unicode_gc_Nl:
	84	case unicode_gc_No:
	85	case unicode_gc_Sc:
	86	case unicode_gc_Sk:
	87	case unicode_gc_Sm:
	88	case unicode_gc_So:
	89	/* letters, digits and symbols are considered to be part of
	90	* words */
	91	if(!in_word) {
	92	dynstr_init(&d);
	93	in_word = 1;
	94	}
	95	dynstr_append_bytes(&d, start, s - start);
	96	break;
	97
	98	case unicode_gc_Cc:
	99	case unicode_gc_Cf:
	100	case unicode_gc_Co:
	101	case unicode_gc_Cs:
	102	case unicode_gc_Zl:
	103	case unicode_gc_Zp:
	104	case unicode_gc_Zs:
	105	case unicode_gc_Pe:
	106	case unicode_gc_Ps:
	107	separator:
	108	if(in_word) {
	109	dynstr_terminate(&d);
	110	vector_append(&v, d.vec);
	111	in_word = 0;
	112	}
113	break;
114
115	case unicode_gc_Mc:
116	case unicode_gc_Me:
117	case unicode_gc_Mn:
118	case unicode_gc_Pc:
119	case unicode_gc_Pd:
120	case unicode_gc_Pf:
121	case unicode_gc_Pi:
122	case unicode_gc_Po:
61507e3c	123	case unicode_gc_Cn:
460b9539	124	/* control and punctuation is completely ignored */
	125	break;
	126
	127	}
	128	}
	129	if(in_word) {
	130	/* pick up the final word */
	131	dynstr_terminate(&d);
	132	vector_append(&v, d.vec);
	133	}
	134	vector_terminate(&v);
	135	if(nvecp)
	136	*nvecp = v.nvec;
	137	return v.vec;
	138	}
	139
	140	/*
	141	Local Variables:
	142	c-basic-offset:2
	143	comment-column:40
	144	End:
	145	*/