#include "getword.h"

namespace {

/*
 * Returns the number of bytes occupied by the UTF-8 encoded character whose
 * first byte is *us (which must be >= 0x80).
 *
 * The length announced by the lead byte (2, 3 or 4) is an upper bound: only
 * continuation bytes (10xxxxxx) that are actually present are counted, so a
 * truncated sequence can never make the caller step over the string's NUL
 * terminator. A stray continuation byte or an invalid lead byte counts as a
 * single byte.
 */
size_t utf8SequenceLength(const unsigned char *us)
{
    size_t expected;

    if ((*us & 0xE0) == 0xC0)
        expected = 2;
    else if ((*us & 0xF0) == 0xE0)
        expected = 3;
    else if ((*us & 0xF8) == 0xF0)
        expected = 4;
    else
        return 1;

    size_t n = 1;
    while (n < expected && (us[n] & 0xC0) == 0x80)
        ++n;
    return n;
}

} // namespace

/*
 * Extracts the next word from the NUL-terminated string *s.
 *
 * Leading non-word characters are skipped. On success the function returns a
 * pointer to the first byte of the word (inside the caller's buffer; the word
 * is NOT NUL-terminated), stores the word's length in bytes in *len, and
 * advances *s past the word and past the single non-word character that
 * terminated it (or to the terminating NUL if the word ends the string).
 *
 * Returns NULL, leaving *s and *len untouched, when no word remains or when
 * s, *s or len is a null pointer.
 */
char *GetWord::run(char **s, size_t *len)
{
    if (!s || !*s || !len)
        return NULL;

    size_t bytes = 0;
    char *t = *s;

    while (*t) {
        // Skip separators one (possibly multi-byte) character at a time.
        if (!GetWord::isWordCharacter(t, &bytes)) {
            t += bytes;
            continue;
        }

        // t is the start of a word; find the first non-word character.
        char *u = t;
        while (*u && GetWord::isWordCharacter(u, &bytes))
            u += bytes;

        *len = static_cast<size_t>(u - t);
        // If a separator ended the word, `bytes` holds that separator's
        // length, so the next call resumes right after it.
        *s = *u ? u + bytes : u;
        return t;
    }

    return NULL;
}

/*
 * Reports whether the character starting at s belongs to a word and stores
 * the number of bytes that character occupies in *bytes (always >= 1).
 *
 * ASCII: only the letters A-Z and a-z are word characters; digits,
 * punctuation, control characters and NUL are not.
 *
 * Non-ASCII (UTF-8): every character is a word character except those whose
 * encoding starts with
 *   0xC2        (U+0080..U+00BF),
 *   0xCB        (U+02C0..U+02FF),
 *   0xE2 0x80   (U+2000..U+203F),
 * which are treated as punctuation.
 * Based on http://www1.tip.nl/~t876506/utf8tbl.html
 */
bool GetWord::isWordCharacter(char *s, size_t *bytes)
{
    const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
    const unsigned char c = *us;

    if (c < 128) {
        *bytes = 1;
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }

    // Length of this one encoded character, not of the whole run of
    // non-ASCII bytes, so adjacent characters are classified individually.
    *bytes = utf8SequenceLength(us);

    // us[1] is safe to read: at worst it is the string's NUL terminator.
    const bool punctuation =
        (c == 226 && us[1] == 128) || c == 194 || c == 203;
    return !punctuation;
}