Removed previous spellcheck implementation
This commit is contained in:
parent
9ef9fd5dea
commit
640bf19139
|
@ -1,283 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2005-2007 Gerald Schmidt.
|
|
||||||
*
|
|
||||||
* This file is part of Xml Copy Editor.
|
|
||||||
*
|
|
||||||
* Xml Copy Editor is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; version 2 of the License.
|
|
||||||
*
|
|
||||||
* Xml Copy Editor is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with Xml Copy Editor; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "spellcheck.h"
|
|
||||||
#include "casehandler.h"
|
|
||||||
#include "contexthandler.h"
|
|
||||||
#include "stringset.h"
|
|
||||||
|
|
||||||
// Creates a spell checker that shares ownership of two word lists: the
// dictionary consulted when checking words and the passive dictionary
// consulted when vetting suggestions.
Spellcheck::Spellcheck (
    boost::shared_ptr<StringSet<char> > dictionaryParameter,
    boost::shared_ptr<StringSet<char> > passiveDictionaryParameter )
        : dictionary ( dictionaryParameter )
        , passiveDictionary ( passiveDictionaryParameter )
{
}
|
|
||||||
|
|
||||||
// Nothing to release by hand: both dictionaries are held through
// boost::shared_ptr members.
Spellcheck::~Spellcheck()
{
}
|
|
||||||
|
|
||||||
// Convenience overload: checks the whole of s by handing its bytes and
// length to the pointer/length overload of checkWord.
bool Spellcheck::checkWord ( string &s )
{
    char *bytes = ( char * ) s.c_str();
    return checkWord ( bytes, s.size() );
}
|
|
||||||
|
|
||||||
// Proposes a single correction for the word s.
//
// Candidate edits are tried in a fixed order and the first one that
// passes checkWord() is returned:
//   1. transpose two adjacent bytes
//   2. replace one byte (never the first) with a letter 'a'..'z'
//   3. split the word in two with a space
//   4. insert a letter 'a'..'z' in front of a byte (never the first)
//   5. delete one byte
// Except for the two-word split, a candidate is rejected when it occurs
// in the passive dictionary, and an accepted candidate is handed to
// CaseHandler::adjustCase together with s before it is returned
// (presumably to carry the capitalisation of s over - TODO confirm
// against casehandler.h).
//
// Bytes above 127 are treated as parts of UTF-8 multibyte sequences and
// are never edited, and the word is never split inside such a sequence.
//
// Returns the suggestion, or the placeholder "---" when no edit yields
// an accepted word.
string Spellcheck::getSuggestion (
    string &s )
{
    string suggestion;
    size_t len = s.size();

    // transpose
    if ( len > 1 )
    {
        for ( size_t ui = 0; ui < len - 1; ++ui )
        {
            // both bytes must be ASCII: swapping an ASCII byte with a
            // neighbouring UTF-8 byte would tear the sequence apart
            if (
                ( unsigned char ) s[ui] > 127 ||
                ( unsigned char ) s[ui + 1] > 127 )
                continue;

            suggestion = s;
            char c = suggestion[ui];
            suggestion[ui] = suggestion[ui + 1];
            suggestion[ui + 1] = c;
            if (
                checkWord ( suggestion ) &&
                passiveDictionary->find ( suggestion ) == NULL )
            {
                CaseHandler::adjustCase ( suggestion, s );
                return suggestion;
            }
        }
    }

    // change one letter (the first byte is left alone)
    for ( size_t ui = 1; ui < len; ++ui )
    {
        // ignore UTF-8 multibyte sequences
        if ( ( unsigned char ) s[ui] > 127 )
            continue;

        for ( char c = 'a'; c <= 'z'; ++c )
        {
            suggestion = s;
            suggestion[ui] = c;
            if (
                checkWord ( suggestion ) &&
                passiveDictionary->find ( suggestion ) == NULL )
            {
                CaseHandler::adjustCase ( suggestion, s );
                return suggestion;
            }
        }
    }

    // two words
    if ( len > 2 )
    {
        char *ptr = ( char * ) s.c_str();
        for ( size_t ui = 1; ui < len; ++ui )
        {
            // a continuation byte (10xxxxxx) is the middle of a UTF-8
            // sequence; a space inserted here would split the character
            if ( ( ( unsigned char ) s[ui] & 0xC0 ) == 0x80 )
                continue;

            if (
                checkWord ( ptr, ui ) &&
                checkWord ( ptr + ui, len - ui )
            )
            {
                suggestion = s;
                suggestion.insert ( ui, 1, ' ' );
                return suggestion;
            }
        }
    }

    // add one letter (never in front of the first byte)
    for ( size_t ui = 1; ui < len; ++ui )
    {
        // ignore UTF-8 multibyte sequences
        if ( ( unsigned char ) s[ui] > 127 )
            continue;

        for ( char c = 'a'; c <= 'z'; ++c )
        {
            suggestion = s;
            suggestion.insert ( ui, 1, c );
            if (
                checkWord ( suggestion ) &&
                passiveDictionary->find ( suggestion ) == NULL )
            {
                CaseHandler::adjustCase ( suggestion, s );
                return suggestion;
            }
        }
    }

    // omit one letter
    if ( len > 2 )
    {
        for ( size_t ui = 0; ui < len; ++ui )
        {
            // ignore UTF-8 multibyte sequences
            if ( ( unsigned char ) s[ui] > 127 )
                continue;

            suggestion = s;
            suggestion.erase ( ui, 1 );
            if (
                checkWord ( suggestion ) &&
                passiveDictionary->find ( suggestion ) == NULL )
            {
                CaseHandler::adjustCase ( suggestion, s );
                return suggestion;
            }
        }
    }

    // nothing matched: hand back the placeholder
    return "---";
}
|
|
||||||
|
|
||||||
// Scans s word by word and appends one ContextMatch to v for every word
// that checkWord() rejects.
//
// Each match is obtained from ContextHandler::getContext (called with
// the word, its length, the start of the text and contextRange) and is
// then completed here: the suggestion for the matched text is appended
// to `replace`, `elementCount` is zeroed and `offset` is set to the
// word's byte offset from the start of s.
void Spellcheck::checkString (
    string &s,
    vector<ContextMatch> &v,
    int contextRange )
{
    char *origin = ( char * ) s.c_str();
    char *cursor = origin;
    char *word;
    size_t wordLength;

    for ( ; ; )
    {
        word = getWord ( &cursor, &wordLength );
        if ( word == NULL )
            break;

        // correctly spelt words produce no entry
        if ( checkWord ( word, wordLength ) )
            continue;

        ContextMatch entry = ContextHandler::getContext (
                                 word,
                                 wordLength,
                                 origin,
                                 contextRange );

        // handle suggestion
        string proposal = getSuggestion ( entry.match );
        entry.replace.append ( proposal );
        entry.elementCount = 0;
        entry.offset = word - origin;
        v.push_back ( entry );
    }
}
|
|
||||||
|
|
||||||
bool Spellcheck::checkWord ( char *s, size_t len )
|
|
||||||
{
|
|
||||||
// pass empty strings and single-character words
|
|
||||||
if ( len < 2 )
|
|
||||||
return true;
|
|
||||||
|
|
||||||
string buffer;
|
|
||||||
buffer.append ( s, len );
|
|
||||||
if ( dictionary->find ( buffer ) != NULL )
|
|
||||||
return true;
|
|
||||||
|
|
||||||
// lower-case with capital initial
|
|
||||||
string::iterator it = buffer.begin();
|
|
||||||
++it;
|
|
||||||
for ( ; it != buffer.end(); it++ )
|
|
||||||
*it = tolower ( *it );
|
|
||||||
|
|
||||||
if ( dictionary->find ( buffer ) != NULL )
|
|
||||||
return true;
|
|
||||||
|
|
||||||
// lower-case throughout
|
|
||||||
it = buffer.begin();
|
|
||||||
*it = tolower ( *it );
|
|
||||||
|
|
||||||
if ( dictionary->find ( buffer ) != NULL )
|
|
||||||
return true;
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Classifies the character that starts at s and reports its size.
//
// s      points into a NUL-terminated UTF-8 buffer
// bytes  receives the number of bytes the character occupies (always at
//        least 1, so callers can advance by it)
//
// Returns true when the character may be part of a word: the ASCII
// letters A-Z and a-z, and every multibyte character except those in
// the ranges treated as punctuation below.
bool Spellcheck::isWordCharacter ( char *s, size_t *bytes )
{
    unsigned char *us = ( unsigned char * ) s;
    unsigned char lead = *us;

    // ASCII: only letters count; this also rejects '{' (123)
    if ( lead < 128 )
    {
        *bytes = 1;
        return
            ( lead >= 'A' && lead <= 'Z' ) ||
            ( lead >= 'a' && lead <= 'z' );
    }

    // UTF-8: the lead byte announces the length of the sequence.
    // A stray continuation byte or invalid lead byte counts as one byte.
    size_t expected = 1;
    if ( ( lead & 0xE0 ) == 0xC0 )
        expected = 2;
    else if ( ( lead & 0xF0 ) == 0xE0 )
        expected = 3;
    else if ( ( lead & 0xF8 ) == 0xF0 )
        expected = 4;

    // consume only continuation bytes (10xxxxxx) that are really there;
    // the terminating NUL is not one, so the scan never leaves the buffer
    // and a following character is never swallowed
    size_t count = 1;
    while ( count < expected && ( us[count] & 0xC0 ) == 0x80 )
        ++count;
    *bytes = count;

    // Unicode punctuation marks
    // Based on http://www1.tip.nl/~t876506/utf8tbl.html
    bool punctuation =
        ( lead == 226 && us[1] == 128 ) ||
        lead == 194 ||
        lead == 203;
    return !punctuation;
}
|
|
||||||
|
|
||||||
char *Spellcheck::getWord ( char **s, size_t *len )
|
|
||||||
{
|
|
||||||
size_t bytes;
|
|
||||||
char *t, *u;
|
|
||||||
int offset;
|
|
||||||
|
|
||||||
t = *s;
|
|
||||||
|
|
||||||
while ( *t )
|
|
||||||
{
|
|
||||||
if ( isWordCharacter ( t, &bytes ) )
|
|
||||||
{
|
|
||||||
for ( u = t; *u; )
|
|
||||||
{
|
|
||||||
if ( !isWordCharacter ( u, &bytes ) )
|
|
||||||
{
|
|
||||||
*len = u - t;
|
|
||||||
offset = t - *s;
|
|
||||||
*s += *len + offset + bytes;
|
|
||||||
return t;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
u += bytes;
|
|
||||||
}
|
|
||||||
if ( *t && ! ( *u ) )
|
|
||||||
{
|
|
||||||
*len = u - t;
|
|
||||||
offset = t - *s;
|
|
||||||
*s = u;
|
|
||||||
return t;
|
|
||||||
}
|
|
||||||
t = u;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
t += bytes;
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
|
|
@ -1,51 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2005-2007 Gerald Schmidt.
|
|
||||||
*
|
|
||||||
* This file is part of Xml Copy Editor.
|
|
||||||
*
|
|
||||||
* Xml Copy Editor is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; version 2 of the License.
|
|
||||||
*
|
|
||||||
* Xml Copy Editor is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with Xml Copy Editor; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef SPELLCHECK_H
|
|
||||||
#define SPELLCHECK_H
|
|
||||||
|
|
||||||
#include <set>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <boost/shared_ptr.hpp>
|
|
||||||
#include "contexthandler.h"
|
|
||||||
#include "stringset.h"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class Spellcheck
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
Spellcheck (
|
|
||||||
boost::shared_ptr<StringSet<char> > dictionaryParameter,
|
|
||||||
boost::shared_ptr<StringSet<char> > passiveDictionaryParameter );
|
|
||||||
~Spellcheck();
|
|
||||||
inline bool checkWord ( string &s );
|
|
||||||
void checkString (
|
|
||||||
string &s,
|
|
||||||
vector<ContextMatch> &v,
|
|
||||||
int contextRange );
|
|
||||||
string getSuggestion ( string &s );
|
|
||||||
private:
|
|
||||||
boost::shared_ptr<StringSet<char> > dictionary, passiveDictionary;
|
|
||||||
bool checkWord ( char *s, size_t len );
|
|
||||||
char *getWord ( char **s, size_t *len );
|
|
||||||
inline bool isWordCharacter ( char *s, size_t *bytes );
|
|
||||||
};
|
|
||||||
#endif
|
|
Loading…
Reference in New Issue