xml-copy-editor-code/src/getword.cpp

98 lines
2.4 KiB
C++
Raw Normal View History

/*
* Copyright 2005-2007 Gerald Schmidt.
*
* This file is part of Xml Copy Editor.
*
* Xml Copy Editor is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; version 2 of the License.
*
* Xml Copy Editor is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Xml Copy Editor; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
2007-09-07 23:17:30 +02:00
#include "getword.h"
2007-09-08 00:25:30 +02:00
/*
 * Extracts the next word from the NUL-terminated buffer *s.
 *
 * Characters are classified with GetWord::isWordCharacter. Leading non-word
 * characters are skipped; the word is the following run of word characters.
 *
 * s    in/out. On entry *s is the position at which scanning starts. When a
 *      word is found, *s is advanced past the word and, if the word was
 *      ended by a non-word character rather than by the terminating NUL,
 *      past that one non-word character as well. Left unchanged when no
 *      word is found.
 * len  out. Receives the length of the word in bytes. Left unchanged when
 *      no word is found.
 *
 * Returns a pointer to the first byte of the word inside the caller's buffer
 * (the word is NOT NUL-terminated; use *len), or NULL when the buffer holds
 * no further word or when s, *s or len is NULL.
 */
char *GetWord::run ( char **s, size_t *len )
{
	// nothing sensible can be done without a buffer and a place for the length
	if ( !s || !*s || !len )
		return NULL;

	size_t bytes = 0;

	// skip everything in front of the word
	char *wordStart = *s;
	while ( *wordStart && !GetWord::isWordCharacter ( wordStart, &bytes ) )
		wordStart += bytes;

	if ( !*wordStart )
		return NULL;

	// consume the word itself, one (possibly multi-byte) character at a time
	char *wordEnd = wordStart;
	while ( *wordEnd && GetWord::isWordCharacter ( wordEnd, &bytes ) )
		wordEnd += bytes;

	*len = wordEnd - wordStart;

	// When the scan stopped on a delimiter, 'bytes' holds that delimiter's
	// size, so the next call resumes right after it. When it stopped on the
	// terminating NUL, stay on the NUL so the next call reports no word.
	*s = *wordEnd ? wordEnd + bytes : wordEnd;
	return wordStart;
}
2007-09-08 00:25:30 +02:00
/*
 * Decides whether the character starting at s belongs to a word and reports
 * how many bytes that character occupies.
 *
 * s      points into a NUL-terminated, UTF-8 encoded buffer.
 * bytes  out. Always set to a value >= 1: the number of bytes to step over
 *        to reach the next character (1 for ASCII, including the NUL).
 *
 * Returns true for ASCII letters (A-Z, a-z) and for every non-ASCII
 * character except those encoded with lead byte 0xC2 (U+0080-U+00BF),
 * lead byte 0xCB (U+02C0-U+02FF) or the prefix 0xE2 0x80 (U+2000-U+203F),
 * which are treated as punctuation. Returns false for everything else,
 * including the terminating NUL.
 */
bool GetWord::isWordCharacter ( char *s, size_t *bytes )
{
	const unsigned char *us = reinterpret_cast<const unsigned char *> ( s );
	const unsigned char lead = *us;

	// ASCII: only letters are word characters ('{' == 123 is not a letter)
	if ( lead < 128 )
	{
		*bytes = 1;
		return ( lead >= 'A' && lead <= 'Z' ) || ( lead >= 'a' && lead <= 'z' );
	}

	// UTF-8: the lead byte announces the length of the sequence. Stray
	// continuation bytes (0x80-0xBF) and invalid lead bytes (0xF8-0xFF)
	// are stepped over one byte at a time.
	size_t expected = 1;
	if ( lead >= 0xC0 && lead < 0xE0 )
		expected = 2;
	else if ( lead >= 0xE0 && lead < 0xF0 )
		expected = 3;
	else if ( lead >= 0xF0 && lead < 0xF8 )
		expected = 4;

	// Only count continuation bytes that are really there: a truncated
	// sequence must not make the caller step past the terminating NUL,
	// and a following character must not be swallowed.
	size_t length = 1;
	while ( length < expected && ( us[length] & 0xC0 ) == 0x80 )
		++length;
	*bytes = length;

	// Unicode punctuation marks
	// Based on http://www1.tip.nl/~t876506/utf8tbl.html
	// (us[1] is readable here: lead is non-zero, so at worst it is the NUL)
	const bool punctuation =
	    ( lead == 226 && us[1] == 128 ) ||
	    lead == 194 ||
	    lead == 203;

	return !punctuation;
}