xml-copy-editor-code/src/wrapregex.cpp

249 lines
6.3 KiB
C++
Raw Normal View History

2007-09-07 23:17:30 +02:00
#include <iostream>
#include <string>
#include <ctype.h>
#include <stdexcept>
#include "wrapregex.h"
#include "contexthandler.h"
using namespace std;
2007-09-08 00:25:30 +02:00
// Builds a regex wrapper around a compiled PCRE pattern.
//
// pattern               regular expression; always compiled with PCRE_UTF8
// matchCase             when false, PCRE_CASELESS is added to the options
// replaceParameter      replacement template, stored in `replace`
// arrayLengthParameter  number of ints allocated for the offset vector that
//                       is later handed to pcre_exec
//
// An empty pattern or the catch-all ".*" puts the object into the disabled
// state: nothing is compiled or allocated.
//
// Throws runtime_error carrying PCRE's error text when compilation fails.
WrapRegex::WrapRegex (
    const string& pattern,
    bool matchCase,
    const string& replaceParameter,
    const int arrayLengthParameter ) :
		replace ( replaceParameter ),
		arrayLength ( arrayLengthParameter )
{
	// Give every owned pointer a defined value, including on the disabled
	// path where nothing is allocated.
	patternStructure = NULL;
	patternExtraStructure = NULL;
	matchArray = NULL;

	disabled = pattern.empty() || pattern == ".*";
	if ( disabled )
		return;

	matchArray = new int[arrayLength];

	// compile
	const int optionsFlag =
	    matchCase ? PCRE_UTF8 : ( PCRE_CASELESS | PCRE_UTF8 );
	const char *errorPointer = NULL;
	int errorOffset = 0;
	patternStructure = pcre_compile (
	                       pattern.c_str(),
	                       optionsFlag,
	                       &errorPointer,
	                       &errorOffset,
	                       NULL );
	if ( patternStructure == NULL )
	{
		// The destructor does not run for an object whose constructor
		// throws, so the offset vector has to be released here.
		delete[] matchArray;
		matchArray = NULL;
		throw runtime_error ( errorPointer );
	}

	// The study result is optional extra data; whatever pcre_study returns
	// (possibly NULL) is passed unchanged to pcre_exec later.
	patternExtraStructure = pcre_study ( patternStructure, 0, &errorPointer );
}
// Releases the compiled pattern, its study data and the offset vector.
//
// A disabled object never allocated any of these, so there is nothing to do.
// The destructor deliberately does not throw: an exception escaping a
// destructor terminates the program during stack unwinding (and always under
// C++11's implicit noexcept), and neither pcre_free nor delete[] on an int
// array reports failure through exceptions.
WrapRegex::~WrapRegex()
{
	if ( disabled )
		return;

	pcre_free ( patternStructure );
	pcre_free ( patternExtraStructure );
	delete[] matchArray;
}
2007-09-08 00:25:30 +02:00
// Finds every match of the compiled pattern in `buffer`.
//
// Forwards the string's character data and size to matchPatternGlobal_,
// which appends one ContextMatch per hit to `matchVector`. `elementCount`
// and `context` are passed through untouched.
//
// Returns the value produced by matchPatternGlobal_, or 0 when this object
// is disabled.
int WrapRegex::matchPatternGlobal (
    string &buffer,
    vector<ContextMatch> &matchVector,
    unsigned elementCount,
    int context )
{
	if ( !disabled )
	{
		const char *text = buffer.c_str();
		const size_t textLength = buffer.size();
		return matchPatternGlobal_ (
		           text,
		           textLength,
		           matchVector,
		           elementCount,
		           context );
	}
	return 0;
}
2007-09-08 00:25:30 +02:00
// Replaces every match of the pattern in `buffer` with the interpolated
// replacement template and returns the resulting string.
//
// buffer      text to search; it is not modified
// matchCount  out-parameter, set to the number of replacements made
//
// A disabled object returns `buffer` unchanged with *matchCount == 0.
//
// The whole buffer is always handed to pcre_exec together with a start
// offset (the same scheme matchPatternGlobal_ uses) instead of advancing the
// subject pointer. That keeps `^` and lookbehind assertions evaluated against
// the real text, honours buffer.size() rather than stopping at the first NUL
// byte, and avoids re-measuring the remaining text on every iteration.
string WrapRegex::replaceGlobal (
    const string& buffer,
    int *matchCount )
{
	*matchCount = 0;
	if ( disabled )
		return buffer;

	const char *subject = buffer.c_str();
	const size_t length = buffer.size();
	size_t offset = 0; // where the next search starts
	size_t copied = 0; // input before this index is already in `output`

	string output;
	output.reserve ( length );

	for ( ;; )
	{
		returnValue = pcre_exec (
		                  patternStructure,
		                  patternExtraStructure,
		                  subject,
		                  static_cast<int> ( length ),
		                  static_cast<int> ( offset ),
		                  0,
		                  matchArray,
		                  arrayLength );
		if ( returnValue < 0 )
			break;

		++ ( *matchCount );
		const size_t matchStart = static_cast<size_t> ( matchArray[0] );
		const size_t matchEnd = static_cast<size_t> ( matchArray[1] );

		// unmatched text preceding the hit, then the replacement
		output.append ( subject + copied, matchStart - copied );
		output.append ( getInterpolatedString_ (
		                    const_cast<char *> ( subject ),
		                    const_cast<char *> ( replace.c_str() ) ) );
		copied = offset = matchEnd;

		if ( matchEnd == matchStart )
		{
			// An empty match consumes nothing; searching again from the
			// same offset would find it forever. Stop at the end of the
			// text, otherwise step over one whole UTF-8 character (lead
			// byte plus any 10xxxxxx continuation bytes). The skipped
			// character is copied with the next chunk of unmatched text.
			if ( offset >= length )
				break;
			++offset;
			while ( offset < length &&
			        ( static_cast<unsigned char> ( subject[offset] ) & 0xC0 ) == 0x80 )
				++offset;
		}
	}

	// whatever follows the last match
	output.append ( subject + copied, length - copied );
	return output;
}
int WrapRegex::matchPatternGlobal_ (
const char *buffer,
size_t buflen,
vector<ContextMatch> &matchVector,
unsigned elementCount,
int context )
{
if ( disabled )
return 0;
char *s, *origin;
int matchcount;
size_t offset;
ContextMatch match;
s = origin = ( char * ) buffer;
matchcount = 0;
offset = 0;
while ( ( returnValue = pcre_exec (
patternStructure,
patternExtraStructure,
s,
buflen,
offset,
0,
matchArray,
arrayLength ) ) >= 0 )
2007-09-07 23:17:30 +02:00
{
2007-09-08 00:25:30 +02:00
++matchcount;
if ( context )
{
match = ContextHandler::getContext (
s + matchArray[0],
matchArray[1] - matchArray[0],
origin,
context );
}
else
{
match.prelog = match.postlog = "";
match.match.assign ( s + matchArray[0], matchArray[1] - matchArray[0] );
}
2007-09-07 23:17:30 +02:00
2007-09-08 00:25:30 +02:00
// record element and offset information
match.elementCount = elementCount;
match.offset = matchArray[0];
2007-09-07 23:17:30 +02:00
2007-09-08 00:25:30 +02:00
if ( replace != "" )
match.replace = getInterpolatedString_ ( s, ( char * ) replace.c_str() );
2007-09-07 23:17:30 +02:00
2007-09-08 00:25:30 +02:00
matchVector.push_back ( match );
if ( ( offset = matchArray[1] ) >= buflen )
break;
}
return matchcount;
2007-09-07 23:17:30 +02:00
}
2007-09-08 00:25:30 +02:00
// Expands the replacement template `source` for the most recent match.
//
// buffer  subject text of the last pcre_exec call; passed on to
//         getSubpattern_ to extract captured groups
// source  NUL-terminated replacement template
//
// Recognised sequences:
//   \<digits>  text of the capture group with that number
//   \t         tab character
//   \n         newline character
// A backslash followed by anything else is copied literally, and a backslash
// that immediately follows such a literal backslash is copied literally too
// (so "\\" yields two backslashes, as before).
//
// Returns the expanded text, or "" when this object is disabled.
string WrapRegex::getInterpolatedString_ ( char *buffer, char *source )
{
	if ( disabled )
		return "";

	string interpol_string;

	// True only while the previous character was a literal, unpaired
	// backslash.
	bool escapeState = false;

	for ( const char *s = source; *s; ++s )
	{
		if ( *s != '\\' )
		{
			interpol_string += *s;
			// An ordinary character ends any pending escape. Without this
			// reset a stray "\x" earlier in the template would cause a
			// later "\1" to be emitted literally instead of substituted.
			escapeState = false;
			continue;
		}

		escapeState = !escapeState;
		if ( !escapeState )
		{
			// second backslash of a pair: keep it as-is
			interpol_string += *s;
			continue;
		}

		const char next = * ( s + 1 );
		// isdigit() is undefined for negative values, which plain char can
		// hold for UTF-8 bytes; always go through unsigned char.
		if ( isdigit ( static_cast<unsigned char> ( next ) ) )
		{
			// Accumulate the group number in place. Growth stops once the
			// value is past anything PCRE can number (its limit is 65535
			// groups), so an absurdly long digit run cannot overflow and
			// simply resolves to a non-existent group.
			unsigned group = 0;
			while ( isdigit ( static_cast<unsigned char> ( * ( s + 1 ) ) ) )
			{
				++s;
				if ( group < 100000u )
					group = group * 10 + static_cast<unsigned> ( *s - '0' );
			}
			// s now rests on the last digit; the loop increment moves on
			interpol_string += getSubpattern_ ( buffer, group );
			escapeState = false;
		}
		else if ( next == 't' )
		{
			interpol_string += '\t';
			++s;
			escapeState = false;
		}
		else if ( next == 'n' )
		{
			interpol_string += '\n';
			++s;
			escapeState = false;
		}
		else
		{
			// unknown escape: keep the backslash, leave the state pending
			interpol_string += *s;
		}
	}
	return interpol_string;
}
2007-09-08 00:25:30 +02:00
// Returns the text captured by group `subpattern` in the most recent match.
//
// s           subject text that was passed to the last pcre_exec call
// subpattern  capture group number (0 is the whole match)
//
// Returns "" when this object is disabled or when PCRE reports any error
// for the requested group.
string WrapRegex::getSubpattern_ ( char *s, unsigned subpattern )
{
	if ( disabled )
		return "";

	// pcre_exec returns 0 when the offset vector was too small to hold every
	// group; in that case the PCRE API expects the number of pairs that fit
	// (a third of the vector) as the string count.
	const int captured = ( returnValue == 0 ) ? arrayLength / 3 : returnValue;

	const char *sub = NULL;
	const int length = pcre_get_substring (
	                       s,
	                       matchArray,
	                       captured,
	                       static_cast<int> ( subpattern ),
	                       &sub );
	// every PCRE error code is negative; treat them all as "no text"
	if ( length < 0 || sub == NULL )
		return "";

	// Use the reported length so captured text containing a NUL byte is not
	// cut short.
	string subString ( sub, static_cast<size_t> ( length ) );
	pcre_free_substring ( sub );
	return subString;
}