Fixed Unicode problems

This commit is contained in:
Zane U. Ji 2013-10-25 19:23:34 +08:00
parent a9916c901e
commit 5310603467
5 changed files with 95 additions and 65 deletions

View File

@ -60,7 +60,7 @@ WrapXerces::~WrapXerces()
delete catalogResolver;
}
bool WrapXerces::validate ( const std::string& fileName )
bool WrapXerces::validate ( const wxString& fileName )
{
SAX2XMLReader *parser = XMLReaderFactory::createXMLReader();
@ -84,7 +84,7 @@ bool WrapXerces::validate ( const std::string& fileName )
try
{
parser->parse ( fileName.c_str() );
parser->parse ( (const XMLCh *) toString ( fileName ).GetData() );
}
catch ( XMLException& e )
{
@ -221,3 +221,22 @@ wxString WrapXerces::toString ( const XMLCh *str )
{
return wxString ( ( const char * ) str, getMBConv() );
}
// Converts a wxString to raw bytes using the converter returned by
// getMBConv() and hands them back in a wxMemoryBuffer.
//
// str:     the text to convert.
// returns: a buffer holding the converted text including its terminator, or,
//          if either conversion step reports wxCONV_FAILED, a buffer holding
//          just one XMLCh-sized NUL.
wxMemoryBuffer WrapXerces::toString ( const wxString &str )
{
	// Xerces-C crashes when the file name is NULL. We'd better return
	// something other than NULL, so a lone terminator is the failure result.
	const static XMLCh chNull = '\0';
	wxMemoryBuffer buffer ( 0 );

	// Plus '\0': the terminator is converted together with the text, so the
	// length passed to FromWChar() deliberately counts it.
	const size_t lenWC = str.length() + 1;

	// First pass: a NULL destination only asks for the required byte count.
	size_t lenMB = getMBConv().FromWChar ( NULL, 0, str.c_str(), lenWC );
	if ( lenMB != wxCONV_FAILED )
	{
		// Second pass: perform the conversion into storage of that size.
		buffer.SetBufSize ( lenMB );
		lenMB = getMBConv().FromWChar ( ( char * ) buffer.GetData(), lenMB, str.c_str(), lenWC );
	}

	// Either pass may fail; never feed wxCONV_FAILED to SetDataLen().
	if ( lenMB == wxCONV_FAILED )
	{
		buffer.AppendData ( &chNull, sizeof chNull );
		return buffer;
	}

	buffer.SetDataLen ( lenMB );
	return buffer;
}

View File

@ -22,6 +22,7 @@
#include <wx/wx.h>
#include <wx/strconv.h>
#include <wx/buffer.h>
#include <string>
#include <utility>
@ -41,14 +42,22 @@ class WrapXerces
static void Init() throw ();
WrapXerces();
virtual ~WrapXerces();
bool validate ( const std::string& fileName );
bool validate ( const wxString &fileName );
bool validateMemory ( const char *buffer, size_t len,
const wxString &system, wxThread *thread = NULL );
const wxString &getLastError();
std::pair<int, int> getErrorPosition();
static const wxMBConv &getMBConv();
static wxString toString ( const XMLCh *str );
// Convert Unicode string to const XMLCh *
//#if wxCHECK_VERSION(2,9,0)
// static wxCharTypeBuffer<XMLCh> toString ( const wxString &str );
//#else
static wxMemoryBuffer toString ( const wxString &str );
//#endif
private:
static const wxMBConv &getMBConv();
XercesCatalogResolver *catalogResolver;
wxString lastError;
std::pair<int, int> errorPosition;

View File

@ -3123,76 +3123,75 @@ bool MyFrame::openFile ( wxString& fileName, bool largeFile )
char *finalBuffer;
size_t finalBufferLen;
// adjust for UTF-8 BOM
if ( docBuffer &&
( unsigned char ) docBuffer[0] == 0xEF &&
( unsigned char ) docBuffer[1] == 0xBB &&
( unsigned char ) docBuffer[2] == 0xBF )
std::string encoding;
if ( docBufferLen >= 4 && // UTF-32 BE
( unsigned char ) docBuffer[0] == 0x00 &&
( unsigned char ) docBuffer[1] == 0x00 &&
( unsigned char ) docBuffer[2] == 0xFE &&
( unsigned char ) docBuffer[3] == 0xFF )
{
docBuffer += 4;
docBufferLen -= 4;
encoding = "UTF-32BE";
}
else if ( docBufferLen >= 4 && // UTF-32 LE
( unsigned char ) docBuffer[0] == 0xFF &&
( unsigned char ) docBuffer[1] == 0xFE &&
( unsigned char ) docBuffer[2] == 0x00 &&
( unsigned char ) docBuffer[3] == 0x00 )
{
docBuffer += 4;
docBufferLen -= 4;
encoding = "UTF-32LE";
}
else if ( docBufferLen >= 2 && //UTF-16 BE
( unsigned char ) docBuffer[0] == 0xFE &&
( unsigned char ) docBuffer[1] == 0xFF )
{
docBuffer += 2;
docBufferLen -= 2;
encoding = "UTF-16BE";
}
else if ( docBufferLen >= 2 && //UTF-16 LE
( unsigned char ) docBuffer[0] == 0xFF &&
( unsigned char ) docBuffer[1] == 0xFE )
{
docBuffer += 2;
docBufferLen -= 2;
encoding = "UTF-16LE";
}
else if ( docBufferLen >= 3 && //UTF-8
( unsigned char ) docBuffer[0] == 0xEF &&
( unsigned char ) docBuffer[1] == 0xBB &&
( unsigned char ) docBuffer[2] == 0xBF )
{
docBuffer += 3;
docBufferLen -= 3;
isUtf8 = true;
encoding = "UTF-8";
}
// no UTF-8 BOM found
std::string encoding;
if ( !isUtf8 || !binaryfile->getDataLen() )
if ( encoding.empty() )
{
XmlEncodingSpy es;
es.parse ( docBuffer, docBufferLen );
encoding = es.getEncoding();
if ( encoding == "UTF-8" ||
encoding == "utf-8" ||
encoding == "US-ASCII" ||
encoding == "us-ascii" ) // US-ASCII is a subset of UTF-8
isUtf8 = true;
if ( encoding.empty() ) // Expat couldn't parse file (e.g. UTF-32)
encoding = getApproximateEncoding ( docBuffer, docBufferLen );
}
// convert buffer if not UTF-8
int nBOM = 0;
if ( isUtf8 )
if ( encoding == "UTF-8" ||
encoding == "utf-8" ||
encoding == "US-ASCII" ||
encoding == "us-ascii" || // US-ASCII is a subset of UTF-8
docBufferLen == 0 )
{
finalBuffer = docBuffer;
finalBufferLen = docBufferLen;
isUtf8 = true;
}
else
{
// clear any other BOMs
if ( docBuffer && // UTF-32 BE
( unsigned char ) docBuffer[0] == 0x00 &&
( unsigned char ) docBuffer[1] == 0x00 &&
( unsigned char ) docBuffer[2] == 0xFE &&
( unsigned char ) docBuffer[3] == 0xFF )
{
nBOM = 4;
}
else if ( docBuffer && // UTF-32 LE
( unsigned char ) docBuffer[0] == 0xFF &&
( unsigned char ) docBuffer[1] == 0xFE &&
( unsigned char ) docBuffer[2] == 0x00 &&
( unsigned char ) docBuffer[3] == 0x00 )
{
nBOM = 4;
}
else if ( docBuffer && //UTF-16 BE
( unsigned char ) docBuffer[0] == 0xFE &&
( unsigned char ) docBuffer[1] == 0xFF )
{
nBOM = 2;
}
else if ( docBuffer && //UTF-16 LE
( unsigned char ) docBuffer[0] == 0xFF &&
( unsigned char ) docBuffer[1] == 0xFE )
{
nBOM = 2;
}
if ( !encoding.size() ) // Expat couldn't parse file (e.g. UTF-32)
{
encoding = getApproximateEncoding ( docBuffer + nBOM, docBufferLen - nBOM );
}
wxString wideEncoding = wxString (
encoding.c_str(),
wxConvLocal,
@ -3227,7 +3226,7 @@ bool MyFrame::openFile ( wxString& fileName, bool largeFile )
size_t nconv;
char *buffer;
size_t iconvBufferLeft, docBufferLeft;
iconvBufferLen = iconvBufferLeft = (docBufferLen - nBOM) * iconvLenMultiplier + 1;
iconvBufferLen = iconvBufferLeft = docBufferLen * iconvLenMultiplier + 1;
docBufferLeft = docBufferLen;
iconvBuffer.extend ( iconvBufferLen );
finalBuffer = buffer = iconvBuffer.data(); // buffer will be incremented by iconv
@ -3302,10 +3301,8 @@ bool MyFrame::openFile ( wxString& fileName, bool largeFile )
statusProgress ( _T ( "Parsing document..." ) );
std::auto_ptr<WrapExpat> we ( new WrapExpat() );
bool optimisedParseSuccess = false;
// omit XML declaration
if ( !isUtf8 && finalBufferLen &&
if ( !isUtf8 && finalBufferLen > 5 &&
finalBuffer[0] == '<' &&
finalBuffer[1] == '?' &&
finalBuffer[2] == 'x' &&
@ -3323,6 +3320,7 @@ bool MyFrame::openFile ( wxString& fileName, bool largeFile )
}
}
bool optimisedParseSuccess = false;
if ( finalBuffer )
{
optimisedParseSuccess = we->parse ( finalBuffer, finalBufferLen );
@ -3998,6 +3996,11 @@ void MyFrame::OnValidateSchema ( wxCommandEvent& event )
{
std::string rawBuffer, schemaLocation;
getRawText ( doc, rawBuffer );
if ( !XmlEncodingHandler::setUtf8 ( rawBuffer ) )
{
encodingMessage();
return;
}
auto_ptr<XmlSchemaLocator> xsl ( new XmlSchemaLocator() );
xsl->parse ( rawBuffer.c_str() );
if ( ( xsl->getSchemaLocation() ) . empty() )
@ -4034,8 +4037,7 @@ void MyFrame::OnValidateSchema ( wxCommandEvent& event )
doc->clearErrorIndicators();
std::auto_ptr<WrapXerces> validator ( new WrapXerces() );
std::string fileNameLocal = ( const char * ) fileName.mb_str ( wxConvLocal );
if ( !validator->validate ( fileNameLocal ) )
if ( !validator->validate ( fileName ) )
{
statusProgress ( wxEmptyString );
messagePane ( validator->getLastError(), CONST_WARNING );

View File

@ -432,7 +432,7 @@ void XmlPromptGenerator::handleSchema (
parser->setValidationSchemaFullChecking ( true );
Grammar *rootGrammar = parser->loadGrammar
( ( const XMLCh * ) ( const char * ) schemaPath.mb_str ( WrapXerces::getMBConv() )
( ( const XMLCh * ) WrapXerces::toString ( schemaPath ).GetData()
, Grammar::SchemaGrammarType
);
if ( !rootGrammar )

View File

@ -203,8 +203,8 @@ void XmlSchemaGenerator::generateData ( const wxString &elementName,
continue;
for ( attrItr = attrMap.begin(); attrItr != attrMap.end(); attrItr++ )
{
if ( attrs->getNamedItem ( ( const XMLCh * ) ( const char * )
attrItr->first.mb_str ( WrapXerces::getMBConv() ) ) == NULL )
if ( attrs->getNamedItem ( ( const XMLCh * )
WrapXerces::toString ( attrItr->first ).GetData() ) == NULL )
{
optAttrs.insert ( attrItr->first );
}