From d47e52cedc9be93cc60bdf6382009710b0883e08 Mon Sep 17 00:00:00 2001 From: "Zane U. Ji" Date: Sat, 25 Aug 2012 08:19:11 +0800 Subject: [PATCH] Supported creating Schema/DTD from XML --- src/xmlcopyeditor.cpp | 36 +++ src/xmlcopyeditor.h | 2 + src/xmlschemagenerator.cpp | 484 +++++++++++++++++++++++++++++++++++++ src/xmlschemagenerator.h | 101 ++++++++ 4 files changed, 623 insertions(+) create mode 100644 src/xmlschemagenerator.cpp create mode 100644 src/xmlschemagenerator.h diff --git a/src/xmlcopyeditor.cpp b/src/xmlcopyeditor.cpp index 5ac6868..f8cae56 100755 --- a/src/xmlcopyeditor.cpp +++ b/src/xmlcopyeditor.cpp @@ -60,6 +60,7 @@ #include #include #include +#include "xmlschemagenerator.h" #define ngettext wxGetTranslation @@ -135,6 +136,7 @@ BEGIN_EVENT_TABLE ( MyFrame, wxFrame ) EVT_MENU ( ID_CHECK_WELLFORMED, MyFrame::OnCheckWellformedness ) EVT_MENU ( ID_VALIDATE_RELAX_NG, MyFrame::OnValidateRelaxNG ) EVT_MENU ( ID_VALIDATE_W3C_SCHEMA, MyFrame::OnValidateSchema ) + EVT_MENU ( ID_CREATE_SCHEMA, MyFrame::OnCreateSchema ) EVT_MENU ( ID_XPATH, MyFrame::OnXPath ) EVT_MENU_RANGE ( ID_XSLT, ID_XSLT_WORDML_DOCBOOK, MyFrame::OnXslt ) EVT_MENU ( ID_PRETTYPRINT, MyFrame::OnPrettyPrint ) @@ -4039,6 +4041,38 @@ void MyFrame::OnValidateSchema ( wxCommandEvent& event ) documentOk ( _ ( "valid" ) ); } +void MyFrame::OnCreateSchema ( wxCommandEvent& event ) +{ + statusProgress ( wxEmptyString ); + + XmlDoc *doc = getActiveDocument(); + if ( doc == NULL ) + return; + + std::string rawBufferUtf8; + getRawText ( doc, rawBufferUtf8 ); + if ( !XmlEncodingHandler::setUtf8 ( rawBufferUtf8 ) ) + { + encodingMessage(); + return; + } + + int ret = wxMessageBox ( _("Create W3C schema?\n\nYes:\tW3C Schema\nNo:\tDTD"), + _("Schema type"), wxYES_NO | wxCANCEL | wxICON_QUESTION); + if ( ret == wxCANCEL ) return; + + Grammar::GrammarType type = ( ret == wxYES ) ? + Grammar::SchemaGrammarType : Grammar::DTDGrammarType; + std::auto_ptr gen ( new XmlSchemaGenerator() ); + const wxString &schema = gen->generate(type, doc->getFullFileName(), + rawBufferUtf8.c_str(), rawBufferUtf8.size() ); + if (schema.IsEmpty()) { + messagePane ( gen->getLastError(), CONST_WARNING ); + return; + } + newDocument ( schema ); +} + void MyFrame::OnXPath ( wxCommandEvent& event ) { statusProgress ( wxEmptyString ); @@ -5296,6 +5330,8 @@ wxMenuBar *MyFrame::getMenuBar() wxID_ANY, _ ( "&Validate" ), validationMenu ); + xmlMenu->Append ( ID_CREATE_SCHEMA, _ ( "Create &Schema...\tF10" ), + _ ( "Create schema" ) ); xmlMenu->AppendSeparator(); xmlMenu->Append ( wxID_ANY, diff --git a/src/xmlcopyeditor.h b/src/xmlcopyeditor.h index 6ca1b50..befbe40 100755 --- a/src/xmlcopyeditor.h +++ b/src/xmlcopyeditor.h @@ -120,6 +120,7 @@ enum ID_VALIDATE_DTD, ID_VALIDATE_RELAX_NG, ID_VALIDATE_W3C_SCHEMA, + ID_CREATE_SCHEMA, ID_XPATH, ID_XSLT, ID_XSLT_TEI_FO, @@ -286,6 +287,7 @@ class MyFrame : public wxFrame void OnValidateDTD ( wxCommandEvent& event ); void OnValidateRelaxNG ( wxCommandEvent& event ); void OnValidateSchema ( wxCommandEvent& event ); + void OnCreateSchema ( wxCommandEvent& event ); void OnXPath ( wxCommandEvent& event ); void OnXslt ( wxCommandEvent& event ); void OnValidatePreset ( wxCommandEvent& event ); diff --git a/src/xmlschemagenerator.cpp b/src/xmlschemagenerator.cpp new file mode 100644 index 0000000..bf54051 --- /dev/null +++ b/src/xmlschemagenerator.cpp @@ -0,0 +1,484 @@ +/* + * Copyright 2012 Zane U. Ji. + * + * This file is part of Xml Copy Editor. + * + * Xml Copy Editor is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * Xml Copy Editor is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Xml Copy Editor; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xmlschemagenerator.h" +#include "wrapxerces.h" +#include +#include +#include +#include +#include +#include + +XmlSchemaGenerator::XmlSchemaGenerator ( bool inlineSimpleType /*= true*/) + : mInlineSimpleType ( inlineSimpleType ) + , mGrammarType ( Grammar::SchemaGrammarType ) +{ + WrapXerces::Init(); +} + +XmlSchemaGenerator::~XmlSchemaGenerator() +{ +} + +const wxString &XmlSchemaGenerator::generate ( Grammar::GrammarType grammarType, + const wxString &filepath, const char *buffer, size_t len ) +{ + mGrammarType = grammarType; + mElements.clear(); + mSchema.Clear(); + + std::auto_ptr parser ( new XercesDOMParser() ); + parser->setDoNamespaces ( true ); + parser->setDoSchema ( true ); + parser->setValidationSchemaFullChecking ( false ); + + MemBufInputSource source ( ( const XMLByte * ) buffer, len, + filepath.mb_str( wxConvLocal ) ); + try { + //XMLPlatformUtils::fgSSE2ok = false; + parser->parse ( source ); + } + catch ( XMLException& e ) + { + mLastError = WrapXerces::toString ( e.getMessage() ); + return mSchema; + } + + DOMDocument *doc = parser->getDocument(); + if ( doc == NULL ) + { + mLastError = _ ("Failed to load xml file."); + return mSchema; + } + + mSchema << _T("") << getEOL(); + if ( mGrammarType == Grammar::SchemaGrammarType ) + mSchema << _T("") + << getEOL(); + + DOMElement *root = doc->getDocumentElement(); + if ( root != NULL ) + { + findAllElements ( *root ); + generateData ( *root, 1 ); + if ( mInlineSimpleType && mGrammarType == Grammar::SchemaGrammarType ) + outputSchema ( *root ); + } + + if ( mGrammarType == Grammar::SchemaGrammarType ) + mSchema << _T("") << getEOL(); + + return mSchema; +} + +void XmlSchemaGenerator::findAllElements ( const DOMElement &element, + size_t nIndent /*= 0*/) +{ + wxString tagName = WrapXerces::toString ( element.getTagName() ); + mElements[tagName].nodes.insert ( &element ); + + DOMElement *child = element.getFirstElementChild(); + for ( ; child != NULL; child = child->getNextElementSibling() ) + { + findAllElements ( *child, nIndent ); + } +} + +void XmlSchemaGenerator::generateData ( const DOMElement &element, + size_t nIndent /*= 0*/) +{ + wxString name = WrapXerces::toString ( element.getTagName() ); + if ( mElements[name].name.empty() ) + { // Only generate data once + generateData ( name, nIndent ); + } + + DOMElement *child = element.getFirstElementChild(); + for ( ; child != NULL; child = child->getNextElementSibling() ) + { + generateData ( *child, nIndent ); + } +} + +void XmlSchemaGenerator::generateData ( const wxString &elementName, + size_t nIndent /*= 0*/) +{ + ElmtData &data = mElements[elementName]; + std::set::iterator elmtItr; + + data.name = elementName; + + //Content + std::map &childMap = data.children; + std::map::iterator itr; + std::set previous; + elmtItr = data.nodes.begin(); + for ( ; elmtItr != data.nodes.end(); elmtItr++ ) + { + previous.clear(); + + std::map countMap; + DOMElement *child = ( **elmtItr ).getFirstElementChild(); + for ( ; child != NULL; child = child->getNextElementSibling() ) + { + wxString name = WrapXerces::toString ( child->getTagName() ); + childMap[name].prevSiblings.insert ( previous.begin(), previous.end() ); + childMap[name].prevSiblings.erase ( name ); // Don't depend on oneself + previous.insert ( name ); + countMap[name] += 1; + } + std::map::iterator countItr = countMap.begin(); + for ( ; countItr != countMap.end(); countItr++ ) + { + if ( childMap[countItr->first].maxOccurs < countItr->second ) + childMap[countItr->first].maxOccurs = countItr->second; + } + if ( childMap.size() == countMap.size() ) + continue; + for ( itr = childMap.begin(); itr != childMap.end(); itr++ ) + { + if ( countMap.find ( itr->first ) != countMap.end() ) + continue; + itr->second.minOccurs = 0; + } + } + // Attribute + std::map &attrMap = data.attrMap; + std::set &optAttrs = data.optAttrs; + std::map::iterator attrItr; + elmtItr = data.nodes.begin(); + for ( ; elmtItr != data.nodes.end(); elmtItr++ ) + { + if ( ! ( **elmtItr ).hasAttributes() ) + continue; + + wxString name; + DOMAttr *attr; + DOMNamedNodeMap *attrs = ( **elmtItr ).getAttributes(); + size_t i = attrs->getLength(); + while ( i-- > 0 ) + { + attr = ( DOMAttr* ) attrs->item ( i ); + name = WrapXerces::toString ( attr->getName() ); + if ( attr->getPrefix() != NULL ) + { + wxLogDebug ( _T("Ignore: %s"), name.c_str() ); + continue; + } + if ( attr->getSpecified() ) + attrMap[name]; // Initialize attribute map + else + attrMap[name] = attr->getValue(); + } + if ( attrMap.size() == optAttrs.size() ) + continue; + for ( attrItr = attrMap.begin(); attrItr != attrMap.end(); attrItr++ ) + { + const static wxMBConvUTF16 conv; + if ( attrs->getNamedItem ( ( const XMLCh * ) ( const char * ) + attrItr->first.mb_str ( conv ) ) == NULL ) + { + optAttrs.insert ( attrItr->first ); + } + } + } + + // Deal with sequence + wxLogDebug ( _T("%s:"), elementName.c_str() ); + data.useSequence = getSequence ( data.sequence, childMap ); + + // Now we have the data of the element + if ( mGrammarType == Grammar::DTDGrammarType ) + { + generateDTD ( data, nIndent ); + mSchema << data.schema; + } + else if ( !mInlineSimpleType ) + { // Or wait until all data are available + generateSchema ( data, nIndent ); + mSchema << data.schema; + } +} + +void XmlSchemaGenerator::outputSchema ( const DOMElement &element ) +{ + wxString tagName = WrapXerces::toString ( element.getTagName() ); + ElmtData &data = mElements[tagName]; + if ( data.schema.empty() ) + { + if ( mGrammarType == Grammar::SchemaGrammarType ) + generateSchema ( data, 1 ); + else + generateDTD ( data, 1 ); + mSchema << data.schema; + } + + DOMElement *child = element.getFirstElementChild(); + for ( ; child != NULL; child = child->getNextElementSibling() ) + { + outputSchema ( *child ); + } +} + +void XmlSchemaGenerator::generateSchema ( ElmtData &data, size_t nIndent ) +{ + wxString &schema = data.schema; + + if ( data.children.size() == 0 && data.attrMap.size() == 0 ) + { + if ( !mInlineSimpleType ) + { + addIndent ( schema, nIndent ); + schema << _T("") << getEOL(); + } + return; + } + addIndent ( schema, nIndent++ ); + schema << _T("") << getEOL(); + if ( data.children.size() > 0 ) + { + addIndent ( schema, nIndent++ ); + schema << _T("") << getEOL(); + addIndent ( schema, nIndent++ ); + + size_t minOccurs = 1, maxOccurs = 1, minTotal = 0; + std::map::const_iterator itr; + for ( itr = data.children.begin(); itr != data.children.end(); itr++ ) + { + if ( itr->second.minOccurs < minOccurs ) + minOccurs = itr->second.minOccurs; + if ( itr->second.maxOccurs > maxOccurs ) + maxOccurs = itr->second.maxOccurs; + minTotal += itr->second.minOccurs; + } + if ( data.useSequence ) + { + schema << _T(" 1) schema << _T(" maxOccurs=\"unbounded\""); + if ( minTotal == 0 ) schema << _T(" minOccurs=\"0\""); + schema << _T(">") << getEOL(); + + std::vector::const_iterator seqItr; + seqItr = data.sequence.begin(); + for ( ; seqItr != data.sequence.end(); seqItr++ ) + { + const ChildData &child = data.children[*seqItr]; + addIndent ( schema, nIndent ); + if ( mInlineSimpleType ) + { // Check if it's a simple type + const ElmtData *childElmt = &mElements[*seqItr]; + if ( childElmt->children.size() == 0 + && childElmt->attrMap.size() == 0 ) + { + schema << _T("") << getEOL(); + continue; + } + } + schema << _T(" 1 ) + { + schema << _T(" maxOccurs=\"unbounded\""); + } + } + schema << _T("/>") << getEOL(); + } + + addIndent ( schema, --nIndent ); + if ( data.useSequence ) + { + schema << _T("") << getEOL(); + } + else + { + schema << _T("") << getEOL(); + } + } + else if ( data.attrMap.size() != 0 ) + { + addIndent ( schema, nIndent++ ); + schema << _T("") << getEOL(); + } + std::map::const_iterator attrItr; + attrItr = data.attrMap.begin(); + for ( ; attrItr != data.attrMap.end(); attrItr++ ) + { + addIndent ( schema, nIndent ); + schema << _T("first + << _T("\" type=\"xs:string\""); + if ( attrItr->second != NULL ) + { + schema << _T(" default=\"") + << WrapXerces::toString ( attrItr->second ) << _T("\""); + } + else if ( data.optAttrs.find ( attrItr->first ) + == data.optAttrs.end() ) + { + schema << _T(" use=\"required\""); + } + schema << _T("/>") << getEOL(); + } + + addIndent ( schema, --nIndent ); + schema << _T("") << getEOL(); + addIndent ( schema, --nIndent ); + schema << _T("") << getEOL(); +} + +void XmlSchemaGenerator::generateDTD ( ElmtData &data, size_t WXUNUSED ( nIndent ) ) +{ + wxString &schema = data.schema; + + schema << _T("::const_iterator seqItr; + seqItr = data.sequence.begin(); + if (data.useSequence) + { + for ( ; seqItr != data.sequence.end(); seqItr++ ) + { + schema << separator << *seqItr; + separator = _T(", "); + const ChildData &child = data.children[*seqItr]; + if ( child.minOccurs == 0 ) + schema << ( child.maxOccurs > 1 ? _T("*") : _T("?") ); + else if ( child.maxOccurs > 1 ) + schema << _T("+"); + } + schema << _T(")"); + } + else + { + size_t minTotal = 0; + for ( ; seqItr != data.sequence.end(); seqItr++ ) + { + schema << separator << *seqItr; + separator = _T(" | "); + minTotal += data.children[*seqItr].maxOccurs; + } + schema << ( minTotal > 0 ? _T(")+") : _T(")*") ); + } + } + schema << _T(">") << getEOL(); + + if ( !data.attrMap.empty() ) + { + const static wxString indent = + wxString ( getEOL() ) + _T(" "); + schema << _T("::const_iterator attrItr; + attrItr = data.attrMap.begin(); + for ( ; attrItr != data.attrMap.end(); attrItr++ ) + { + schema << indent << attrItr->first << _T(" CDATA"); + if ( attrItr->second != NULL ) // Has default value + schema << _T(" \"") << WrapXerces::toString ( attrItr->second ) << _T("\""); + else if ( data.optAttrs.find ( attrItr->first ) == data.optAttrs.end() ) + schema << _T(" #REQUIRED"); + else + schema << _T(" #IMPLIED"); + } + schema << _T(">") << getEOL(); + } +} + +bool XmlSchemaGenerator::getSequence ( std::vector &sequence, + const std::map &elmtMap ) +{ + bool deadlock = false; + + sequence.clear(); + + std::vector::iterator seqItr, seqFindItr; + std::set::const_iterator prevItr, prevEnd; + std::map::const_iterator itr; + + bool retry; + do + { + retry = false; + for ( itr = elmtMap.begin(); itr != elmtMap.end(); itr++ ) + { + seqFindItr = std::find ( sequence.begin(), sequence.end(), + itr->first ); + if ( seqFindItr != sequence.end() ) + continue; + + seqItr = sequence.begin(); + prevItr = itr->second.prevSiblings.begin(); + prevEnd = itr->second.prevSiblings.end(); + for ( ; prevItr != prevEnd; prevItr++ ) + { // Find last index of dependent elements + seqFindItr = std::find ( sequence.begin(), sequence.end(), + *prevItr ); + if ( seqFindItr != sequence.end() ) + { + if ( seqItr < seqFindItr ) + { + seqItr = seqFindItr; + } + continue; + } + const std::set &previous = + elmtMap.find ( *prevItr )->second.prevSiblings; + if ( previous.find ( itr->first ) == previous.end() ) + { // Not a deadlock + retry = true; + break; + } + else + { + deadlock = true; + } + } + if ( prevItr != prevEnd ) + continue; // The preceding doesn't exist + + if ( seqItr != sequence.end() ) + { + seqItr++; + } + sequence.insert ( seqItr, itr->first ); + wxLogDebug ( _T(" %s"), itr->first.c_str() ); + } + } while ( retry ); + + return !deadlock; +} diff --git a/src/xmlschemagenerator.h b/src/xmlschemagenerator.h new file mode 100644 index 0000000..e2f7094 --- /dev/null +++ b/src/xmlschemagenerator.h @@ -0,0 +1,101 @@ +/* + * Copyright 2012 Zane U. Ji. + * + * This file is part of Xml Copy Editor. + * + * Xml Copy Editor is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * Xml Copy Editor is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Xml Copy Editor; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef XMLSCHEMAGENERATOR_H_ +#define XMLSCHEMAGENERATOR_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace xercesc; + +class XmlSchemaGenerator +{ +public: + XmlSchemaGenerator ( bool inlineSimpleType = true ); + virtual ~XmlSchemaGenerator(); + + const wxString &generate ( Grammar::GrammarType grammarType, + const wxString &filepath, const char *buffer, size_t len ); + const wxString &getLastError() { return mLastError; } + + static void addIndent ( wxString &str, size_t nIndent ) + { + for ( size_t i = nIndent; i-- > 0; ) + str << _T(" "); + } + static const wxChar *getEOL() { return wxTextFile::GetEOL(); } + +protected: + class ChildData + { + public: + ChildData() : minOccurs ( 1 ), maxOccurs ( 1 ) {} + size_t minOccurs, maxOccurs; + std::set prevSiblings; + }; + class ElmtData + { + public: + ElmtData() : useSequence ( true ) { } + + // All occurs + std::set nodes; + + // Node name. Also used to indicate if the following data are valid + wxString name; + // These are not needed when we don't need to inline empty elements. + // The schema can be created right after we have all the data. + std::map children; + // Sequence of children + std::vector sequence; + bool useSequence; // Use xs:sequence or xs:choice + // Attribute name and default value + std::map attrMap; + // Optional attributes + std::set optAttrs; + + wxString schema; + }; + + void findAllElements ( const DOMElement &element, size_t nIndent = 0 ); + void generateData ( const DOMElement &element, size_t nIndent = 0 ); + void generateData ( const wxString &elementName, size_t nIndent = 0 ); + void outputSchema ( const DOMElement &element ); + void generateSchema ( ElmtData &data, size_t nIndent ); + void generateDTD ( ElmtData &data, size_t nIndent ); + // Returns false if there is a loop dependence, which means that + // has to be used. + bool getSequence ( std::vector &sequence, + const std::map &elmtMap ); + +protected: + bool mInlineSimpleType; + Grammar::GrammarType mGrammarType; + std::map mElements; + wxString mSchema, mLastError; +}; + +#endif /* XMLSCHEMAGENERATOR_H_ */