Fixed Unicode problems

This commit is contained in:
Zane U. Ji 2013-10-25 19:23:34 +08:00
parent a9916c901e
commit 5310603467
5 changed files with 95 additions and 65 deletions

View File

@ -60,7 +60,7 @@ WrapXerces::~WrapXerces()
delete catalogResolver; delete catalogResolver;
} }
bool WrapXerces::validate ( const std::string& fileName ) bool WrapXerces::validate ( const wxString& fileName )
{ {
SAX2XMLReader *parser = XMLReaderFactory::createXMLReader(); SAX2XMLReader *parser = XMLReaderFactory::createXMLReader();
@ -84,7 +84,7 @@ bool WrapXerces::validate ( const std::string& fileName )
try try
{ {
parser->parse ( fileName.c_str() ); parser->parse ( (const XMLCh *) toString ( fileName ).GetData() );
} }
catch ( XMLException& e ) catch ( XMLException& e )
{ {
@ -221,3 +221,22 @@ wxString WrapXerces::toString ( const XMLCh *str )
{ {
return wxString ( ( const char * ) str, getMBConv() ); return wxString ( ( const char * ) str, getMBConv() );
} }
// Convert a wxString to a '\0'-terminated string encoded with getMBConv().
// Callers cast the returned buffer's GetData() to const XMLCh * and pass it
// to Xerces-C.
//
// The returned buffer always holds at least one XMLCh terminator: if either
// conversion pass fails, a lone chNull is returned instead of an empty buffer,
// because Xerces-C crashes when it is handed a NULL string.
wxMemoryBuffer WrapXerces::toString ( const wxString &str )
{
	const static XMLCh chNull = '\0';
	wxMemoryBuffer buffer ( 0 );

	// Convert the terminating '\0' together with the text. This is important;
	// if it were not needed, wxString::mb_str(getMBConv()) would be enough.
	const size_t lenWC = str.length() + 1;

	// First pass: a NULL destination only asks for the required byte count.
	size_t lenMB = getMBConv().FromWChar ( NULL, 0, str.c_str(), lenWC );
	if ( lenMB != wxCONV_FAILED )
	{
		// Second pass: perform the conversion into the sized buffer.
		buffer.SetBufSize ( lenMB );
		lenMB = getMBConv().FromWChar ( ( char * ) buffer.GetData(), lenMB,
		                                str.c_str(), lenWC );
	}

	if ( lenMB == wxCONV_FAILED )
	{
		// Either pass may fail. Never give wxCONV_FAILED to SetDataLen():
		// fall back to an empty, terminated string. The data length is still
		// zero here, so the terminator lands at the start of the buffer.
		buffer.AppendData ( &chNull, sizeof chNull );
		return buffer;
	}

	buffer.SetDataLen ( lenMB );
	return buffer;
}

View File

@ -22,6 +22,7 @@
#include <wx/wx.h> #include <wx/wx.h>
#include <wx/strconv.h> #include <wx/strconv.h>
#include <wx/buffer.h>
#include <string> #include <string>
#include <utility> #include <utility>
@ -41,14 +42,22 @@ class WrapXerces
static void Init() throw (); static void Init() throw ();
WrapXerces(); WrapXerces();
virtual ~WrapXerces(); virtual ~WrapXerces();
bool validate ( const std::string& fileName ); bool validate ( const wxString &fileName );
bool validateMemory ( const char *buffer, size_t len, bool validateMemory ( const char *buffer, size_t len,
const wxString &system, wxThread *thread = NULL ); const wxString &system, wxThread *thread = NULL );
const wxString &getLastError(); const wxString &getLastError();
std::pair<int, int> getErrorPosition(); std::pair<int, int> getErrorPosition();
static const wxMBConv &getMBConv();
static wxString toString ( const XMLCh *str ); static wxString toString ( const XMLCh *str );
// Convert Unicode string to const XMLCh *
//#if wxCHECK_VERSION(2,9,0)
// static wxCharTypeBuffer<XMLCh> toString ( const wxString &str );
//#else
static wxMemoryBuffer toString ( const wxString &str );
//#endif
private: private:
static const wxMBConv &getMBConv();
XercesCatalogResolver *catalogResolver; XercesCatalogResolver *catalogResolver;
wxString lastError; wxString lastError;
std::pair<int, int> errorPosition; std::pair<int, int> errorPosition;

View File

@ -3123,76 +3123,75 @@ bool MyFrame::openFile ( wxString& fileName, bool largeFile )
char *finalBuffer; char *finalBuffer;
size_t finalBufferLen; size_t finalBufferLen;
// adjust for UTF-8 BOM std::string encoding;
if ( docBuffer && if ( docBufferLen >= 4 && // UTF-32 BE
( unsigned char ) docBuffer[0] == 0xEF && ( unsigned char ) docBuffer[0] == 0x00 &&
( unsigned char ) docBuffer[1] == 0xBB && ( unsigned char ) docBuffer[1] == 0x00 &&
( unsigned char ) docBuffer[2] == 0xBF ) ( unsigned char ) docBuffer[2] == 0xFE &&
( unsigned char ) docBuffer[3] == 0xFF )
{
docBuffer += 4;
docBufferLen -= 4;
encoding = "UTF-32BE";
}
else if ( docBufferLen >= 4 && // UTF-32 LE
( unsigned char ) docBuffer[0] == 0xFF &&
( unsigned char ) docBuffer[1] == 0xFE &&
( unsigned char ) docBuffer[2] == 0x00 &&
( unsigned char ) docBuffer[3] == 0x00 )
{
docBuffer += 4;
docBufferLen -= 4;
encoding = "UTF-32LE";
}
else if ( docBufferLen >= 2 && //UTF-16 BE
( unsigned char ) docBuffer[0] == 0xFE &&
( unsigned char ) docBuffer[1] == 0xFF )
{
docBuffer += 2;
docBufferLen -= 2;
encoding = "UTF-16BE";
}
else if ( docBufferLen >= 2 && //UTF-16 LE
( unsigned char ) docBuffer[0] == 0xFF &&
( unsigned char ) docBuffer[1] == 0xFE )
{
docBuffer += 2;
docBufferLen -= 2;
encoding = "UTF-16LE";
}
else if ( docBufferLen >= 3 && //UTF-8
( unsigned char ) docBuffer[0] == 0xEF &&
( unsigned char ) docBuffer[1] == 0xBB &&
( unsigned char ) docBuffer[2] == 0xBF )
{ {
docBuffer += 3; docBuffer += 3;
docBufferLen -= 3; docBufferLen -= 3;
isUtf8 = true; encoding = "UTF-8";
} }
// no UTF-8 BOM found if ( encoding.empty() )
std::string encoding;
if ( !isUtf8 || !binaryfile->getDataLen() )
{ {
XmlEncodingSpy es; XmlEncodingSpy es;
es.parse ( docBuffer, docBufferLen ); es.parse ( docBuffer, docBufferLen );
encoding = es.getEncoding(); encoding = es.getEncoding();
if ( encoding == "UTF-8" || if ( encoding.empty() ) // Expat couldn't parse file (e.g. UTF-32)
encoding == "utf-8" || encoding = getApproximateEncoding ( docBuffer, docBufferLen );
encoding == "US-ASCII" ||
encoding == "us-ascii" ) // US-ASCII is a subset of UTF-8
isUtf8 = true;
} }
// convert buffer if not UTF-8 // convert buffer if not UTF-8
int nBOM = 0; if ( encoding == "UTF-8" ||
if ( isUtf8 ) encoding == "utf-8" ||
encoding == "US-ASCII" ||
encoding == "us-ascii" || // US-ASCII is a subset of UTF-8
docBufferLen == 0 )
{ {
finalBuffer = docBuffer; finalBuffer = docBuffer;
finalBufferLen = docBufferLen; finalBufferLen = docBufferLen;
isUtf8 = true;
} }
else else
{ {
// clear any other BOMs
if ( docBuffer && // UTF-32 BE
( unsigned char ) docBuffer[0] == 0x00 &&
( unsigned char ) docBuffer[1] == 0x00 &&
( unsigned char ) docBuffer[2] == 0xFE &&
( unsigned char ) docBuffer[3] == 0xFF )
{
nBOM = 4;
}
else if ( docBuffer && // UTF-32 LE
( unsigned char ) docBuffer[0] == 0xFF &&
( unsigned char ) docBuffer[1] == 0xFE &&
( unsigned char ) docBuffer[2] == 0x00 &&
( unsigned char ) docBuffer[3] == 0x00 )
{
nBOM = 4;
}
else if ( docBuffer && //UTF-16 BE
( unsigned char ) docBuffer[0] == 0xFE &&
( unsigned char ) docBuffer[1] == 0xFF )
{
nBOM = 2;
}
else if ( docBuffer && //UTF-16 LE
( unsigned char ) docBuffer[0] == 0xFF &&
( unsigned char ) docBuffer[1] == 0xFE )
{
nBOM = 2;
}
if ( !encoding.size() ) // Expat couldn't parse file (e.g. UTF-32)
{
encoding = getApproximateEncoding ( docBuffer + nBOM, docBufferLen - nBOM );
}
wxString wideEncoding = wxString ( wxString wideEncoding = wxString (
encoding.c_str(), encoding.c_str(),
wxConvLocal, wxConvLocal,
@ -3227,7 +3226,7 @@ bool MyFrame::openFile ( wxString& fileName, bool largeFile )
size_t nconv; size_t nconv;
char *buffer; char *buffer;
size_t iconvBufferLeft, docBufferLeft; size_t iconvBufferLeft, docBufferLeft;
iconvBufferLen = iconvBufferLeft = (docBufferLen - nBOM) * iconvLenMultiplier + 1; iconvBufferLen = iconvBufferLeft = docBufferLen * iconvLenMultiplier + 1;
docBufferLeft = docBufferLen; docBufferLeft = docBufferLen;
iconvBuffer.extend ( iconvBufferLen ); iconvBuffer.extend ( iconvBufferLen );
finalBuffer = buffer = iconvBuffer.data(); // buffer will be incremented by iconv finalBuffer = buffer = iconvBuffer.data(); // buffer will be incremented by iconv
@ -3302,10 +3301,8 @@ bool MyFrame::openFile ( wxString& fileName, bool largeFile )
statusProgress ( _T ( "Parsing document..." ) ); statusProgress ( _T ( "Parsing document..." ) );
std::auto_ptr<WrapExpat> we ( new WrapExpat() ); std::auto_ptr<WrapExpat> we ( new WrapExpat() );
bool optimisedParseSuccess = false;
// omit XML declaration // omit XML declaration
if ( !isUtf8 && finalBufferLen && if ( !isUtf8 && finalBufferLen > 5 &&
finalBuffer[0] == '<' && finalBuffer[0] == '<' &&
finalBuffer[1] == '?' && finalBuffer[1] == '?' &&
finalBuffer[2] == 'x' && finalBuffer[2] == 'x' &&
@ -3323,6 +3320,7 @@ bool MyFrame::openFile ( wxString& fileName, bool largeFile )
} }
} }
bool optimisedParseSuccess = false;
if ( finalBuffer ) if ( finalBuffer )
{ {
optimisedParseSuccess = we->parse ( finalBuffer, finalBufferLen ); optimisedParseSuccess = we->parse ( finalBuffer, finalBufferLen );
@ -3998,6 +3996,11 @@ void MyFrame::OnValidateSchema ( wxCommandEvent& event )
{ {
std::string rawBuffer, schemaLocation; std::string rawBuffer, schemaLocation;
getRawText ( doc, rawBuffer ); getRawText ( doc, rawBuffer );
if ( !XmlEncodingHandler::setUtf8 ( rawBuffer ) )
{
encodingMessage();
return;
}
auto_ptr<XmlSchemaLocator> xsl ( new XmlSchemaLocator() ); auto_ptr<XmlSchemaLocator> xsl ( new XmlSchemaLocator() );
xsl->parse ( rawBuffer.c_str() ); xsl->parse ( rawBuffer.c_str() );
if ( ( xsl->getSchemaLocation() ) . empty() ) if ( ( xsl->getSchemaLocation() ) . empty() )
@ -4034,8 +4037,7 @@ void MyFrame::OnValidateSchema ( wxCommandEvent& event )
doc->clearErrorIndicators(); doc->clearErrorIndicators();
std::auto_ptr<WrapXerces> validator ( new WrapXerces() ); std::auto_ptr<WrapXerces> validator ( new WrapXerces() );
std::string fileNameLocal = ( const char * ) fileName.mb_str ( wxConvLocal ); if ( !validator->validate ( fileName ) )
if ( !validator->validate ( fileNameLocal ) )
{ {
statusProgress ( wxEmptyString ); statusProgress ( wxEmptyString );
messagePane ( validator->getLastError(), CONST_WARNING ); messagePane ( validator->getLastError(), CONST_WARNING );

View File

@ -432,7 +432,7 @@ void XmlPromptGenerator::handleSchema (
parser->setValidationSchemaFullChecking ( true ); parser->setValidationSchemaFullChecking ( true );
Grammar *rootGrammar = parser->loadGrammar Grammar *rootGrammar = parser->loadGrammar
( ( const XMLCh * ) ( const char * ) schemaPath.mb_str ( WrapXerces::getMBConv() ) ( ( const XMLCh * ) WrapXerces::toString ( schemaPath ).GetData()
, Grammar::SchemaGrammarType , Grammar::SchemaGrammarType
); );
if ( !rootGrammar ) if ( !rootGrammar )

View File

@ -203,8 +203,8 @@ void XmlSchemaGenerator::generateData ( const wxString &elementName,
continue; continue;
for ( attrItr = attrMap.begin(); attrItr != attrMap.end(); attrItr++ ) for ( attrItr = attrMap.begin(); attrItr != attrMap.end(); attrItr++ )
{ {
if ( attrs->getNamedItem ( ( const XMLCh * ) ( const char * ) if ( attrs->getNamedItem ( ( const XMLCh * )
attrItr->first.mb_str ( WrapXerces::getMBConv() ) ) == NULL ) WrapXerces::toString ( attrItr->first ).GetData() ) == NULL )
{ {
optAttrs.insert ( attrItr->first ); optAttrs.insert ( attrItr->first );
} }