tinyxmlparser.cpp
Go to the documentation of this file.
00001 /*
00002 www.sourceforge.net/projects/tinyxml
00003 Original code by Lee Thomason (www.grinninglizard.com)
00004 
00005 This software is provided 'as-is', without any express or implied 
00006 warranty. In no event will the authors be held liable for any 
00007 damages arising from the use of this software.
00008 
00009 Permission is granted to anyone to use this software for any 
00010 purpose, including commercial applications, and to alter it and 
00011 redistribute it freely, subject to the following restrictions:
00012 
00013 1. The origin of this software must not be misrepresented; you must 
00014 not claim that you wrote the original software. If you use this
00015 software in a product, an acknowledgment in the product documentation
00016 would be appreciated but is not required.
00017 
00018 2. Altered source versions must be plainly marked as such, and 
00019 must not be misrepresented as being the original software.
00020 
00021 3. This notice may not be removed or altered from any source 
00022 distribution.
00023 */
00024 
00025 #include <ctype.h>
00026 #include <stddef.h>
00027 
00028 #include "tinyxml/tinyxml.h"
00029 
00030 //#define DEBUG_PARSER
00031 #if defined( DEBUG_PARSER )
00032 #       if defined( DEBUG ) && defined( _MSC_VER )
00033 #               include <windows.h>
00034 #               define TIXML_LOG OutputDebugString
00035 #       else
00036 #               define TIXML_LOG printf
00037 #       endif
00038 #endif
00039 
00040 // Note tha "PutString" hardcodes the same list. This
00041 // is less flexible than it appears. Changing the entries
00042 // or order will break putstring.       
00043 TiXmlBase::Entity TiXmlBase::entity[ TiXmlBase::NUM_ENTITY ] = 
00044 {
00045         { "&amp;",  5, '&' },
00046         { "&lt;",   4, '<' },
00047         { "&gt;",   4, '>' },
00048         { "&quot;", 6, '\"' },
00049         { "&apos;", 6, '\'' }
00050 };
00051 
// Background on UTF-8 and byte order marks:
//      http://www.unicode.org/faq/utf_bom.html
// That FAQ is also the basis of the lead-byte table (utf8ByteTable), which
// gives the number of bytes in a sequence from its lead byte. Invalid lead
// bytes map to 1: the result will be junk, but the input is passed through
// as much as possible.
//
// Beware of the non-characters in UTF-8:
//      ef bb bf   (Microsoft "lead bytes")
//      ef bf be
//      ef bf bf

// The three Microsoft "lead bytes", in order.
const unsigned char TIXML_UTF_LEAD_0 = 0xEFU;
const unsigned char TIXML_UTF_LEAD_1 = 0xBBU;
const unsigned char TIXML_UTF_LEAD_2 = 0xBFU;
00065 
00066 const int TiXmlBase::utf8ByteTable[256] = 
00067 {
00068         //      0       1       2       3       4       5       6       7       8       9       a       b       c       d       e       f
00069                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x00
00070                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x10
00071                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x20
00072                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x30
00073                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x40
00074                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x50
00075                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x60
00076                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x70 End of ASCII range
00077                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x80 0x80 to 0xc1 invalid
00078                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x90 
00079                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0xa0 
00080                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0xb0 
00081                 1,      1,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      // 0xc0 0xc2 to 0xdf 2 byte
00082                 2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      // 0xd0
00083                 3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      // 0xe0 0xe0 to 0xef 3 byte
00084                 4,      4,      4,      4,      4,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1       // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
00085 };
00086 
00087 
00088 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
00089 {
00090         const unsigned long BYTE_MASK = 0xBF;
00091         const unsigned long BYTE_MARK = 0x80;
00092         const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00093 
00094         if (input < 0x80) 
00095                 *length = 1;
00096         else if ( input < 0x800 )
00097                 *length = 2;
00098         else if ( input < 0x10000 )
00099                 *length = 3;
00100         else if ( input < 0x200000 )
00101                 *length = 4;
00102         else
00103                 { *length = 0; return; }        // This code won't covert this correctly anyway.
00104 
00105         output += *length;
00106 
00107         // Scary scary fall throughs.
00108         switch (*length) 
00109         {
00110                 case 4:
00111                         --output; 
00112                         *output = (char)((input | BYTE_MARK) & BYTE_MASK); 
00113                         input >>= 6;
00114                 case 3:
00115                         --output; 
00116                         *output = (char)((input | BYTE_MARK) & BYTE_MASK); 
00117                         input >>= 6;
00118                 case 2:
00119                         --output; 
00120                         *output = (char)((input | BYTE_MARK) & BYTE_MASK); 
00121                         input >>= 6;
00122                 case 1:
00123                         --output; 
00124                         *output = (char)(input | FIRST_BYTE_MARK[*length]);
00125         }
00126 }
00127 
00128 
00129 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
00130 {
00131         // This will only work for low-ascii, everything else is assumed to be a valid
00132         // letter. I'm not sure this is the best approach, but it is quite tricky trying
00133         // to figure out alhabetical vs. not across encoding. So take a very 
00134         // conservative approach.
00135 
00136 //      if ( encoding == TIXML_ENCODING_UTF8 )
00137 //      {
00138                 if ( anyByte < 127 )
00139                         return isalpha( anyByte );
00140                 else
00141                         return 1;       // What else to do? The unicode set is huge...get the english ones right.
00142 //      }
00143 //      else
00144 //      {
00145 //              return isalpha( anyByte );
00146 //      }
00147 }
00148 
00149 
00150 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
00151 {
00152         // This will only work for low-ascii, everything else is assumed to be a valid
00153         // letter. I'm not sure this is the best approach, but it is quite tricky trying
00154         // to figure out alhabetical vs. not across encoding. So take a very 
00155         // conservative approach.
00156 
00157 //      if ( encoding == TIXML_ENCODING_UTF8 )
00158 //      {
00159                 if ( anyByte < 127 )
00160                         return isalnum( anyByte );
00161                 else
00162                         return 1;       // What else to do? The unicode set is huge...get the english ones right.
00163 //      }
00164 //      else
00165 //      {
00166 //              return isalnum( anyByte );
00167 //      }
00168 }
00169 
00170 
00171 class TiXmlParsingData
00172 {
00173         friend class TiXmlDocument;
00174   public:
00175         void Stamp( const char* now, TiXmlEncoding encoding );
00176 
00177         const TiXmlCursor& Cursor() const       { return cursor; }
00178 
00179   private:
00180         // Only used by the document!
00181         TiXmlParsingData( const char* start, int _tabsize, int row, int col )
00182         {
00183                 assert( start );
00184                 stamp = start;
00185                 tabsize = _tabsize;
00186                 cursor.row = row;
00187                 cursor.col = col;
00188         }
00189 
00190         TiXmlCursor             cursor;
00191         const char*             stamp;
00192         int                             tabsize;
00193 };
00194 
00195 
00196 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
00197 {
00198         assert( now );
00199 
00200         // Do nothing if the tabsize is 0.
00201         if ( tabsize < 1 )
00202         {
00203                 return;
00204         }
00205 
00206         // Get the current row, column.
00207         int row = cursor.row;
00208         int col = cursor.col;
00209         const char* p = stamp;
00210         assert( p );
00211 
00212         while ( p < now )
00213         {
00214                 // Treat p as unsigned, so we have a happy compiler.
00215                 const unsigned char* pU = (const unsigned char*)p;
00216 
00217                 // Code contributed by Fletcher Dunn: (modified by lee)
00218                 switch (*pU) {
00219                         case 0:
00220                                 // We *should* never get here, but in case we do, don't
00221                                 // advance past the terminating null character, ever
00222                                 return;
00223 
00224                         case '\r':
00225                                 // bump down to the next line
00226                                 ++row;
00227                                 col = 0;                                
00228                                 // Eat the character
00229                                 ++p;
00230 
00231                                 // Check for \r\n sequence, and treat this as a single character
00232                                 if (*p == '\n') {
00233                                         ++p;
00234                                 }
00235                                 break;
00236 
00237                         case '\n':
00238                                 // bump down to the next line
00239                                 ++row;
00240                                 col = 0;
00241 
00242                                 // Eat the character
00243                                 ++p;
00244 
00245                                 // Check for \n\r sequence, and treat this as a single
00246                                 // character.  (Yes, this bizarre thing does occur still
00247                                 // on some arcane platforms...)
00248                                 if (*p == '\r') {
00249                                         ++p;
00250                                 }
00251                                 break;
00252 
00253                         case '\t':
00254                                 // Eat the character
00255                                 ++p;
00256 
00257                                 // Skip to next tab stop
00258                                 col = (col / tabsize + 1) * tabsize;
00259                                 break;
00260 
00261                         case TIXML_UTF_LEAD_0:
00262                                 if ( encoding == TIXML_ENCODING_UTF8 )
00263                                 {
00264                                         if ( *(p+1) && *(p+2) )
00265                                         {
00266                                                 // In these cases, don't advance the column. These are
00267                                                 // 0-width spaces.
00268                                                 if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
00269                                                         p += 3; 
00270                                                 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
00271                                                         p += 3; 
00272                                                 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
00273                                                         p += 3; 
00274                                                 else
00275                                                         { p +=3; ++col; }       // A normal character.
00276                                         }
00277                                 }
00278                                 else
00279                                 {
00280                                         ++p;
00281                                         ++col;
00282                                 }
00283                                 break;
00284 
00285                         default:
00286                                 if ( encoding == TIXML_ENCODING_UTF8 )
00287                                 {
00288                                         // Eat the 1 to 4 byte utf8 character.
00289                                         int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
00290                                         if ( step == 0 )
00291                                                 step = 1;               // Error case from bad encoding, but handle gracefully.
00292                                         p += step;
00293 
00294                                         // Just advance one column, of course.
00295                                         ++col;
00296                                 }
00297                                 else
00298                                 {
00299                                         ++p;
00300                                         ++col;
00301                                 }
00302                                 break;
00303                 }
00304         }
00305         cursor.row = row;
00306         cursor.col = col;
00307         assert( cursor.row >= -1 );
00308         assert( cursor.col >= -1 );
00309         stamp = p;
00310         assert( stamp );
00311 }
00312 
00313 
00314 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
00315 {
00316         if ( !p || !*p )
00317         {
00318                 return 0;
00319         }
00320         if ( encoding == TIXML_ENCODING_UTF8 )
00321         {
00322                 while ( *p )
00323                 {
00324                         const unsigned char* pU = (const unsigned char*)p;
00325                         
00326                         // Skip the stupid Microsoft UTF-8 Byte order marks
00327                         if (    *(pU+0)==TIXML_UTF_LEAD_0
00328                                  && *(pU+1)==TIXML_UTF_LEAD_1 
00329                                  && *(pU+2)==TIXML_UTF_LEAD_2 )
00330                         {
00331                                 p += 3;
00332                                 continue;
00333                         }
00334                         else if(*(pU+0)==TIXML_UTF_LEAD_0
00335                                  && *(pU+1)==0xbfU
00336                                  && *(pU+2)==0xbeU )
00337                         {
00338                                 p += 3;
00339                                 continue;
00340                         }
00341                         else if(*(pU+0)==TIXML_UTF_LEAD_0
00342                                  && *(pU+1)==0xbfU
00343                                  && *(pU+2)==0xbfU )
00344                         {
00345                                 p += 3;
00346                                 continue;
00347                         }
00348 
00349                         if ( IsWhiteSpace( *p ) )               // Still using old rules for white space.
00350                                 ++p;
00351                         else
00352                                 break;
00353                 }
00354         }
00355         else
00356         {
00357                 while ( *p && IsWhiteSpace( *p ) )
00358                         ++p;
00359         }
00360 
00361         return p;
00362 }
00363 
00364 #ifdef TIXML_USE_STL
00365 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
00366 {
00367         for( ;; )
00368         {
00369                 if ( !in->good() ) return false;
00370 
00371                 int c = in->peek();
00372                 // At this scope, we can't get to a document. So fail silently.
00373                 if ( !IsWhiteSpace( c ) || c <= 0 )
00374                         return true;
00375 
00376                 *tag += (char) in->get();
00377         }
00378 }
00379 
00380 /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
00381 {
00382         //assert( character > 0 && character < 128 );   // else it won't work in utf-8
00383         while ( in->good() )
00384         {
00385                 int c = in->peek();
00386                 if ( c == character )
00387                         return true;
00388                 if ( c <= 0 )           // Silent failure: can't get document at this scope
00389                         return false;
00390 
00391                 in->get();
00392                 *tag += (char) c;
00393         }
00394         return false;
00395 }
00396 #endif
00397 
00398 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
00399 // "assign" optimization removes over 10% of the execution time.
00400 //
00401 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
00402 {
00403         // Oddly, not supported on some comilers,
00404         //name->clear();
00405         // So use this:
00406         *name = "";
00407         assert( p );
00408 
00409         // Names start with letters or underscores.
00410         // Of course, in unicode, tinyxml has no idea what a letter *is*. The
00411         // algorithm is generous.
00412         //
00413         // After that, they can be letters, underscores, numbers,
00414         // hyphens, or colons. (Colons are valid ony for namespaces,
00415         // but tinyxml can't tell namespaces from names.)
00416         if (    p && *p 
00417                  && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
00418         {
00419                 const char* start = p;
00420                 while(          p && *p
00421                                 &&      (               IsAlphaNum( (unsigned char ) *p, encoding ) 
00422                                                  || *p == '_'
00423                                                  || *p == '-'
00424                                                  || *p == '.'
00425                                                  || *p == ':' ) )
00426                 {
00427                         //(*name) += *p; // expensive
00428                         ++p;
00429                 }
00430                 if ( p-start > 0 ) {
00431                         name->assign( start, p-start );
00432                 }
00433                 return p;
00434         }
00435         return 0;
00436 }
00437 
00438 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
00439 {
00440         // Presume an entity, and pull it out.
00441     TIXML_STRING ent;
00442         int i;
00443         *length = 0;
00444 
00445         if ( *(p+1) && *(p+1) == '#' && *(p+2) )
00446         {
00447                 unsigned long ucs = 0;
00448                 ptrdiff_t delta = 0;
00449                 unsigned mult = 1;
00450 
00451                 if ( *(p+2) == 'x' )
00452                 {
00453                         // Hexadecimal.
00454                         if ( !*(p+3) ) return 0;
00455 
00456                         const char* q = p+3;
00457                         q = strchr( q, ';' );
00458 
00459                         if ( !q || !*q ) return 0;
00460 
00461                         delta = q-p;
00462                         --q;
00463 
00464                         while ( *q != 'x' )
00465                         {
00466                                 if ( *q >= '0' && *q <= '9' )
00467                                         ucs += mult * (*q - '0');
00468                                 else if ( *q >= 'a' && *q <= 'f' )
00469                                         ucs += mult * (*q - 'a' + 10);
00470                                 else if ( *q >= 'A' && *q <= 'F' )
00471                                         ucs += mult * (*q - 'A' + 10 );
00472                                 else 
00473                                         return 0;
00474                                 mult *= 16;
00475                                 --q;
00476                         }
00477                 }
00478                 else
00479                 {
00480                         // Decimal.
00481                         if ( !*(p+2) ) return 0;
00482 
00483                         const char* q = p+2;
00484                         q = strchr( q, ';' );
00485 
00486                         if ( !q || !*q ) return 0;
00487 
00488                         delta = q-p;
00489                         --q;
00490 
00491                         while ( *q != '#' )
00492                         {
00493                                 if ( *q >= '0' && *q <= '9' )
00494                                         ucs += mult * (*q - '0');
00495                                 else 
00496                                         return 0;
00497                                 mult *= 10;
00498                                 --q;
00499                         }
00500                 }
00501                 if ( encoding == TIXML_ENCODING_UTF8 )
00502                 {
00503                         // convert the UCS to UTF-8
00504                         ConvertUTF32ToUTF8( ucs, value, length );
00505                 }
00506                 else
00507                 {
00508                         *value = (char)ucs;
00509                         *length = 1;
00510                 }
00511                 return p + delta + 1;
00512         }
00513 
00514         // Now try to match it.
00515         for( i=0; i<NUM_ENTITY; ++i )
00516         {
00517                 if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
00518                 {
00519                         assert( strlen( entity[i].str ) == entity[i].strLength );
00520                         *value = entity[i].chr;
00521                         *length = 1;
00522                         return ( p + entity[i].strLength );
00523                 }
00524         }
00525 
00526         // So it wasn't an entity, its unrecognized, or something like that.
00527         *value = *p;    // Don't put back the last one, since we return it!
00528         //*length = 1;  // Leave unrecognized entities - this doesn't really work.
00529                                         // Just writes strange XML.
00530         return p+1;
00531 }
00532 
00533 
00534 bool TiXmlBase::StringEqual( const char* p,
00535                                                          const char* tag,
00536                                                          bool ignoreCase,
00537                                                          TiXmlEncoding encoding )
00538 {
00539         assert( p );
00540         assert( tag );
00541         if ( !p || !*p )
00542         {
00543                 assert( 0 );
00544                 return false;
00545         }
00546 
00547         const char* q = p;
00548 
00549         if ( ignoreCase )
00550         {
00551                 while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
00552                 {
00553                         ++q;
00554                         ++tag;
00555                 }
00556 
00557                 if ( *tag == 0 )
00558                         return true;
00559         }
00560         else
00561         {
00562                 while ( *q && *tag && *q == *tag )
00563                 {
00564                         ++q;
00565                         ++tag;
00566                 }
00567 
00568                 if ( *tag == 0 )                // Have we found the end of the tag, and everything equal?
00569                         return true;
00570         }
00571         return false;
00572 }
00573 
00574 const char* TiXmlBase::ReadText(        const char* p, 
00575                                                                         TIXML_STRING * text, 
00576                                                                         bool trimWhiteSpace, 
00577                                                                         const char* endTag, 
00578                                                                         bool caseInsensitive,
00579                                                                         TiXmlEncoding encoding )
00580 {
00581     *text = "";
00582         if (    !trimWhiteSpace                 // certain tags always keep whitespace
00583                  || !condenseWhiteSpace )       // if true, whitespace is always kept
00584         {
00585                 // Keep all the white space.
00586                 while (    p && *p
00587                                 && !StringEqual( p, endTag, caseInsensitive, encoding )
00588                           )
00589                 {
00590                         int len;
00591                         char cArr[4] = { 0, 0, 0, 0 };
00592                         p = GetChar( p, cArr, &len, encoding );
00593                         text->append( cArr, len );
00594                 }
00595         }
00596         else
00597         {
00598                 bool whitespace = false;
00599 
00600                 // Remove leading white space:
00601                 p = SkipWhiteSpace( p, encoding );
00602                 while (    p && *p
00603                                 && !StringEqual( p, endTag, caseInsensitive, encoding ) )
00604                 {
00605                         if ( *p == '\r' || *p == '\n' )
00606                         {
00607                                 whitespace = true;
00608                                 ++p;
00609                         }
00610                         else if ( IsWhiteSpace( *p ) )
00611                         {
00612                                 whitespace = true;
00613                                 ++p;
00614                         }
00615                         else
00616                         {
00617                                 // If we've found whitespace, add it before the
00618                                 // new character. Any whitespace just becomes a space.
00619                                 if ( whitespace )
00620                                 {
00621                                         (*text) += ' ';
00622                                         whitespace = false;
00623                                 }
00624                                 int len;
00625                                 char cArr[4] = { 0, 0, 0, 0 };
00626                                 p = GetChar( p, cArr, &len, encoding );
00627                                 if ( len == 1 )
00628                                         (*text) += cArr[0];     // more efficient
00629                                 else
00630                                         text->append( cArr, len );
00631                         }
00632                 }
00633         }
00634         if ( p && *p )
00635                 p += strlen( endTag );
00636         return ( p && *p ) ? p : 0;
00637 }
00638 
00639 #ifdef TIXML_USE_STL
00640 
00641 void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
00642 {
00643         // The basic issue with a document is that we don't know what we're
00644         // streaming. Read something presumed to be a tag (and hope), then
00645         // identify it, and call the appropriate stream method on the tag.
00646         //
00647         // This "pre-streaming" will never read the closing ">" so the
00648         // sub-tag can orient itself.
00649 
00650         if ( !StreamTo( in, '<', tag ) ) 
00651         {
00652                 SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00653                 return;
00654         }
00655 
00656         while ( in->good() )
00657         {
00658                 int tagIndex = (int) tag->length();
00659                 while ( in->good() && in->peek() != '>' )
00660                 {
00661                         int c = in->get();
00662                         if ( c <= 0 )
00663                         {
00664                                 SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
00665                                 break;
00666                         }
00667                         (*tag) += (char) c;
00668                 }
00669 
00670                 if ( in->good() )
00671                 {
00672                         // We now have something we presume to be a node of 
00673                         // some sort. Identify it, and call the node to
00674                         // continue streaming.
00675                         TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
00676 
00677                         if ( node )
00678                         {
00679                                 node->StreamIn( in, tag );
00680                                 bool isElement = node->ToElement() != 0;
00681                                 delete node;
00682                                 node = 0;
00683 
00684                                 // If this is the root element, we're done. Parsing will be
00685                                 // done by the >> operator.
00686                                 if ( isElement )
00687                                 {
00688                                         return;
00689                                 }
00690                         }
00691                         else
00692                         {
00693                                 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
00694                                 return;
00695                         }
00696                 }
00697         }
00698         // We should have returned sooner.
00699         SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
00700 }
00701 
00702 #endif
00703 
00704 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
00705 {
00706         ClearError();
00707 
00708         // Parse away, at the document level. Since a document
00709         // contains nothing but other tags, most of what happens
00710         // here is skipping white space.
00711         if ( !p || !*p )
00712         {
00713                 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00714                 return 0;
00715         }
00716 
00717         // Note that, for a document, this needs to come
00718         // before the while space skip, so that parsing
00719         // starts from the pointer we are given.
00720         location.Clear();
00721         if ( prevData )
00722         {
00723                 location.row = prevData->cursor.row;
00724                 location.col = prevData->cursor.col;
00725         }
00726         else
00727         {
00728                 location.row = 0;
00729                 location.col = 0;
00730         }
00731         TiXmlParsingData data( p, TabSize(), location.row, location.col );
00732         location = data.Cursor();
00733 
00734         if ( encoding == TIXML_ENCODING_UNKNOWN )
00735         {
00736                 // Check for the Microsoft UTF-8 lead bytes.
00737                 const unsigned char* pU = (const unsigned char*)p;
00738                 if (    *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
00739                          && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
00740                          && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
00741                 {
00742                         encoding = TIXML_ENCODING_UTF8;
00743                         useMicrosoftBOM = true;
00744                 }
00745         }
00746 
00747     p = SkipWhiteSpace( p, encoding );
00748         if ( !p )
00749         {
00750                 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00751                 return 0;
00752         }
00753 
00754         while ( p && *p )
00755         {
00756                 TiXmlNode* node = Identify( p, encoding );
00757                 if ( node )
00758                 {
00759                         p = node->Parse( p, &data, encoding );
00760                         LinkEndChild( node );
00761                 }
00762                 else
00763                 {
00764                         break;
00765                 }
00766 
00767                 // Did we get encoding info?
00768                 if (    encoding == TIXML_ENCODING_UNKNOWN
00769                          && node->ToDeclaration() )
00770                 {
00771                         TiXmlDeclaration* dec = node->ToDeclaration();
00772                         const char* enc = dec->Encoding();
00773                         assert( enc );
00774 
00775                         if ( *enc == 0 )
00776                                 encoding = TIXML_ENCODING_UTF8;
00777                         else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
00778                                 encoding = TIXML_ENCODING_UTF8;
00779                         else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
00780                                 encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
00781                         else 
00782                                 encoding = TIXML_ENCODING_LEGACY;
00783                 }
00784 
00785                 p = SkipWhiteSpace( p, encoding );
00786         }
00787 
00788         // Was this empty?
00789         if ( !firstChild ) {
00790                 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
00791                 return 0;
00792         }
00793 
00794         // All is well.
00795         return p;
00796 }
00797 
00798 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
00799 {       
00800         // The first error in a chain is more accurate - don't set again!
00801         if ( error )
00802                 return;
00803 
00804         assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
00805         error   = true;
00806         errorId = err;
00807         errorDesc = errorString[ errorId ];
00808 
00809         errorLocation.Clear();
00810         if ( pError && data )
00811         {
00812                 data->Stamp( pError, encoding );
00813                 errorLocation = data->Cursor();
00814         }
00815 }
00816 
00817 
00818 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
00819 {
00820         TiXmlNode* returnNode = 0;
00821 
00822         p = SkipWhiteSpace( p, encoding );
00823         if( !p || !*p || *p != '<' )
00824         {
00825                 return 0;
00826         }
00827 
00828         p = SkipWhiteSpace( p, encoding );
00829 
00830         if ( !p || !*p )
00831         {
00832                 return 0;
00833         }
00834 
00835         // What is this thing? 
00836         // - Elements start with a letter or underscore, but xml is reserved.
00837         // - Comments: <!--
00838         // - Decleration: <?xml
00839         // - Everthing else is unknown to tinyxml.
00840         //
00841 
00842         const char* xmlHeader = { "<?xml" };
00843         const char* commentHeader = { "<!--" };
00844         const char* dtdHeader = { "<!" };
00845         const char* cdataHeader = { "<![CDATA[" };
00846 
00847         if ( StringEqual( p, xmlHeader, true, encoding ) )
00848         {
00849                 #ifdef DEBUG_PARSER
00850                         TIXML_LOG( "XML parsing Declaration\n" );
00851                 #endif
00852                 returnNode = new TiXmlDeclaration();
00853         }
00854         else if ( StringEqual( p, commentHeader, false, encoding ) )
00855         {
00856                 #ifdef DEBUG_PARSER
00857                         TIXML_LOG( "XML parsing Comment\n" );
00858                 #endif
00859                 returnNode = new TiXmlComment();
00860         }
00861         else if ( StringEqual( p, cdataHeader, false, encoding ) )
00862         {
00863                 #ifdef DEBUG_PARSER
00864                         TIXML_LOG( "XML parsing CDATA\n" );
00865                 #endif
00866                 TiXmlText* text = new TiXmlText( "" );
00867                 text->SetCDATA( true );
00868                 returnNode = text;
00869         }
00870         else if ( StringEqual( p, dtdHeader, false, encoding ) )
00871         {
00872                 #ifdef DEBUG_PARSER
00873                         TIXML_LOG( "XML parsing Unknown(1)\n" );
00874                 #endif
00875                 returnNode = new TiXmlUnknown();
00876         }
00877         else if (    IsAlpha( *(p+1), encoding )
00878                           || *(p+1) == '_' )
00879         {
00880                 #ifdef DEBUG_PARSER
00881                         TIXML_LOG( "XML parsing Element\n" );
00882                 #endif
00883                 returnNode = new TiXmlElement( "" );
00884         }
00885         else
00886         {
00887                 #ifdef DEBUG_PARSER
00888                         TIXML_LOG( "XML parsing Unknown(2)\n" );
00889                 #endif
00890                 returnNode = new TiXmlUnknown();
00891         }
00892 
00893         if ( returnNode )
00894         {
00895                 // Set the parent, so it can report errors
00896                 returnNode->parent = this;
00897         }
00898         return returnNode;
00899 }
00900 
00901 #ifdef TIXML_USE_STL
00902 
00903 void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
00904 {
00905         // We're called with some amount of pre-parsing. That is, some of "this"
00906         // element is in "tag". Go ahead and stream to the closing ">"
00907         while( in->good() )
00908         {
00909                 int c = in->get();
00910                 if ( c <= 0 )
00911                 {
00912                         TiXmlDocument* document = GetDocument();
00913                         if ( document )
00914                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
00915                         return;
00916                 }
00917                 (*tag) += (char) c ;
00918                 
00919                 if ( c == '>' )
00920                         break;
00921         }
00922 
00923         if ( tag->length() < 3 ) return;
00924 
00925         // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
00926         // If not, identify and stream.
00927 
00928         if (    tag->at( tag->length() - 1 ) == '>' 
00929                  && tag->at( tag->length() - 2 ) == '/' )
00930         {
00931                 // All good!
00932                 return;
00933         }
00934         else if ( tag->at( tag->length() - 1 ) == '>' )
00935         {
00936                 // There is more. Could be:
00937                 //              text
00938                 //              cdata text (which looks like another node)
00939                 //              closing tag
00940                 //              another node.
00941                 for ( ;; )
00942                 {
00943                         StreamWhiteSpace( in, tag );
00944 
00945                         // Do we have text?
00946                         if ( in->good() && in->peek() != '<' ) 
00947                         {
00948                                 // Yep, text.
00949                                 TiXmlText text( "" );
00950                                 text.StreamIn( in, tag );
00951 
00952                                 // What follows text is a closing tag or another node.
00953                                 // Go around again and figure it out.
00954                                 continue;
00955                         }
00956 
00957                         // We now have either a closing tag...or another node.
00958                         // We should be at a "<", regardless.
00959                         if ( !in->good() ) return;
00960                         assert( in->peek() == '<' );
00961                         int tagIndex = (int) tag->length();
00962 
00963                         bool closingTag = false;
00964                         bool firstCharFound = false;
00965 
00966                         for( ;; )
00967                         {
00968                                 if ( !in->good() )
00969                                         return;
00970 
00971                                 int c = in->peek();
00972                                 if ( c <= 0 )
00973                                 {
00974                                         TiXmlDocument* document = GetDocument();
00975                                         if ( document )
00976                                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
00977                                         return;
00978                                 }
00979                                 
00980                                 if ( c == '>' )
00981                                         break;
00982 
00983                                 *tag += (char) c;
00984                                 in->get();
00985 
00986                                 // Early out if we find the CDATA id.
00987                                 if ( c == '[' && tag->size() >= 9 )
00988                                 {
00989                                         size_t len = tag->size();
00990                                         const char* start = tag->c_str() + len - 9;
00991                                         if ( strcmp( start, "<![CDATA[" ) == 0 ) {
00992                                                 assert( !closingTag );
00993                                                 break;
00994                                         }
00995                                 }
00996 
00997                                 if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
00998                                 {
00999                                         firstCharFound = true;
01000                                         if ( c == '/' )
01001                                                 closingTag = true;
01002                                 }
01003                         }
01004                         // If it was a closing tag, then read in the closing '>' to clean up the input stream.
01005                         // If it was not, the streaming will be done by the tag.
01006                         if ( closingTag )
01007                         {
01008                                 if ( !in->good() )
01009                                         return;
01010 
01011                                 int c = in->get();
01012                                 if ( c <= 0 )
01013                                 {
01014                                         TiXmlDocument* document = GetDocument();
01015                                         if ( document )
01016                                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01017                                         return;
01018                                 }
01019                                 assert( c == '>' );
01020                                 *tag += (char) c;
01021 
01022                                 // We are done, once we've found our closing tag.
01023                                 return;
01024                         }
01025                         else
01026                         {
01027                                 // If not a closing tag, id it, and stream.
01028                                 const char* tagloc = tag->c_str() + tagIndex;
01029                                 TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
01030                                 if ( !node )
01031                                         return;
01032                                 node->StreamIn( in, tag );
01033                                 delete node;
01034                                 node = 0;
01035 
01036                                 // No return: go around from the beginning: text, closing tag, or node.
01037                         }
01038                 }
01039         }
01040 }
01041 #endif
01042 
01043 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01044 {
01045         p = SkipWhiteSpace( p, encoding );
01046         TiXmlDocument* document = GetDocument();
01047 
01048         if ( !p || !*p )
01049         {
01050                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
01051                 return 0;
01052         }
01053 
01054         if ( data )
01055         {
01056                 data->Stamp( p, encoding );
01057                 location = data->Cursor();
01058         }
01059 
01060         if ( *p != '<' )
01061         {
01062                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
01063                 return 0;
01064         }
01065 
01066         p = SkipWhiteSpace( p+1, encoding );
01067 
01068         // Read the name.
01069         const char* pErr = p;
01070 
01071     p = ReadName( p, &value, encoding );
01072         if ( !p || !*p )
01073         {
01074                 if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
01075                 return 0;
01076         }
01077 
01078     TIXML_STRING endTag ("</");
01079         endTag += value;
01080 
01081         // Check for and read attributes. Also look for an empty
01082         // tag or an end tag.
01083         while ( p && *p )
01084         {
01085                 pErr = p;
01086                 p = SkipWhiteSpace( p, encoding );
01087                 if ( !p || !*p )
01088                 {
01089                         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
01090                         return 0;
01091                 }
01092                 if ( *p == '/' )
01093                 {
01094                         ++p;
01095                         // Empty tag.
01096                         if ( *p  != '>' )
01097                         {
01098                                 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );             
01099                                 return 0;
01100                         }
01101                         return (p+1);
01102                 }
01103                 else if ( *p == '>' )
01104                 {
01105                         // Done with attributes (if there were any.)
01106                         // Read the value -- which can include other
01107                         // elements -- read the end tag, and return.
01108                         ++p;
01109                         p = ReadValue( p, data, encoding );             // Note this is an Element method, and will set the error if one happens.
01110                         if ( !p || !*p ) {
01111                                 // We were looking for the end tag, but found nothing.
01112                                 // Fix for [ 1663758 ] Failure to report error on bad XML
01113                                 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
01114                                 return 0;
01115                         }
01116 
01117                         // We should find the end tag now
01118                         // note that:
01119                         // </foo > and
01120                         // </foo> 
01121                         // are both valid end tags.
01122                         if ( StringEqual( p, endTag.c_str(), false, encoding ) )
01123                         {
01124                                 p += endTag.length();
01125                                 p = SkipWhiteSpace( p, encoding );
01126                                 if ( p && *p && *p == '>' ) {
01127                                         ++p;
01128                                         return p;
01129                                 }
01130                                 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
01131                                 return 0;
01132                         }
01133                         else
01134                         {
01135                                 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
01136                                 return 0;
01137                         }
01138                 }
01139                 else
01140                 {
01141                         // Try to read an attribute:
01142                         TiXmlAttribute* attrib = new TiXmlAttribute();
01143                         if ( !attrib )
01144                         {
01145                                 return 0;
01146                         }
01147 
01148                         attrib->SetDocument( document );
01149                         pErr = p;
01150                         p = attrib->Parse( p, data, encoding );
01151 
01152                         if ( !p || !*p )
01153                         {
01154                                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
01155                                 delete attrib;
01156                                 return 0;
01157                         }
01158 
01159                         // Handle the strange case of double attributes:
01160                         #ifdef TIXML_USE_STL
01161                         TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
01162                         #else
01163                         TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
01164                         #endif
01165                         if ( node )
01166                         {
01167                                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
01168                                 delete attrib;
01169                                 return 0;
01170                         }
01171 
01172                         attributeSet.Add( attrib );
01173                 }
01174         }
01175         return p;
01176 }
01177 
01178 
01179 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01180 {
01181         TiXmlDocument* document = GetDocument();
01182 
01183         // Read in text and elements in any order.
01184         const char* pWithWhiteSpace = p;
01185         p = SkipWhiteSpace( p, encoding );
01186 
01187         while ( p && *p )
01188         {
01189                 if ( *p != '<' )
01190                 {
01191                         // Take what we have, make a text element.
01192                         TiXmlText* textNode = new TiXmlText( "" );
01193 
01194                         if ( !textNode )
01195                         {
01196                             return 0;
01197                         }
01198 
01199                         if ( TiXmlBase::IsWhiteSpaceCondensed() )
01200                         {
01201                                 p = textNode->Parse( p, data, encoding );
01202                         }
01203                         else
01204                         {
01205                                 // Special case: we want to keep the white space
01206                                 // so that leading spaces aren't removed.
01207                                 p = textNode->Parse( pWithWhiteSpace, data, encoding );
01208                         }
01209 
01210                         if ( !textNode->Blank() )
01211                                 LinkEndChild( textNode );
01212                         else
01213                                 delete textNode;
01214                 } 
01215                 else 
01216                 {
01217                         // We hit a '<'
01218                         // Have we hit a new element or an end tag? This could also be
01219                         // a TiXmlText in the "CDATA" style.
01220                         if ( StringEqual( p, "</", false, encoding ) )
01221                         {
01222                                 return p;
01223                         }
01224                         else
01225                         {
01226                                 TiXmlNode* node = Identify( p, encoding );
01227                                 if ( node )
01228                                 {
01229                                         p = node->Parse( p, data, encoding );
01230                                         LinkEndChild( node );
01231                                 }                               
01232                                 else
01233                                 {
01234                                         return 0;
01235                                 }
01236                         }
01237                 }
01238                 pWithWhiteSpace = p;
01239                 p = SkipWhiteSpace( p, encoding );
01240         }
01241 
01242         if ( !p )
01243         {
01244                 if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
01245         }       
01246         return p;
01247 }
01248 
01249 
01250 #ifdef TIXML_USE_STL
01251 void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
01252 {
01253         while ( in->good() )
01254         {
01255                 int c = in->get();      
01256                 if ( c <= 0 )
01257                 {
01258                         TiXmlDocument* document = GetDocument();
01259                         if ( document )
01260                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01261                         return;
01262                 }
01263                 (*tag) += (char) c;
01264 
01265                 if ( c == '>' )
01266                 {
01267                         // All is well.
01268                         return;         
01269                 }
01270         }
01271 }
01272 #endif
01273 
01274 
01275 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01276 {
01277         TiXmlDocument* document = GetDocument();
01278         p = SkipWhiteSpace( p, encoding );
01279 
01280         if ( data )
01281         {
01282                 data->Stamp( p, encoding );
01283                 location = data->Cursor();
01284         }
01285         if ( !p || !*p || *p != '<' )
01286         {
01287                 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
01288                 return 0;
01289         }
01290         ++p;
01291     value = "";
01292 
01293         while ( p && *p && *p != '>' )
01294         {
01295                 value += *p;
01296                 ++p;
01297         }
01298 
01299         if ( !p )
01300         {
01301                 if ( document ) 
01302                         document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
01303         }
01304         if ( p && *p == '>' )
01305                 return p+1;
01306         return p;
01307 }
01308 
01309 #ifdef TIXML_USE_STL
01310 void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
01311 {
01312         while ( in->good() )
01313         {
01314                 int c = in->get();      
01315                 if ( c <= 0 )
01316                 {
01317                         TiXmlDocument* document = GetDocument();
01318                         if ( document )
01319                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01320                         return;
01321                 }
01322 
01323                 (*tag) += (char) c;
01324 
01325                 if ( c == '>' 
01326                          && tag->at( tag->length() - 2 ) == '-'
01327                          && tag->at( tag->length() - 3 ) == '-' )
01328                 {
01329                         // All is well.
01330                         return;         
01331                 }
01332         }
01333 }
01334 #endif
01335 
01336 
01337 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01338 {
01339         TiXmlDocument* document = GetDocument();
01340         value = "";
01341 
01342         p = SkipWhiteSpace( p, encoding );
01343 
01344         if ( data )
01345         {
01346                 data->Stamp( p, encoding );
01347                 location = data->Cursor();
01348         }
01349         const char* startTag = "<!--";
01350         const char* endTag   = "-->";
01351 
01352         if ( !StringEqual( p, startTag, false, encoding ) )
01353         {
01354                 if ( document )
01355                         document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
01356                 return 0;
01357         }
01358         p += strlen( startTag );
01359 
01360         // [ 1475201 ] TinyXML parses entities in comments
01361         // Oops - ReadText doesn't work, because we don't want to parse the entities.
01362         // p = ReadText( p, &value, false, endTag, false, encoding );
01363         //
01364         // from the XML spec:
01365         /*
01366          [Definition: Comments may appear anywhere in a document outside other markup; in addition, 
01367                       they may appear within the document type declaration at places allowed by the grammar. 
01368                                   They are not part of the document's character data; an XML processor MAY, but need not, 
01369                                   make it possible for an application to retrieve the text of comments. For compatibility, 
01370                                   the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity 
01371                                   references MUST NOT be recognized within comments.
01372 
01373                                   An example of a comment:
01374 
01375                                   <!-- declarations for <head> & <body> -->
01376         */
01377 
01378     value = "";
01379         // Keep all the white space.
01380         while ( p && *p && !StringEqual( p, endTag, false, encoding ) )
01381         {
01382                 value.append( p, 1 );
01383                 ++p;
01384         }
01385         if ( p && *p ) 
01386                 p += strlen( endTag );
01387 
01388         return p;
01389 }
01390 
01391 
01392 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01393 {
01394         p = SkipWhiteSpace( p, encoding );
01395         if ( !p || !*p ) return 0;
01396 
01397         if ( data )
01398         {
01399                 data->Stamp( p, encoding );
01400                 location = data->Cursor();
01401         }
01402         // Read the name, the '=' and the value.
01403         const char* pErr = p;
01404         p = ReadName( p, &name, encoding );
01405         if ( !p || !*p )
01406         {
01407                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
01408                 return 0;
01409         }
01410         p = SkipWhiteSpace( p, encoding );
01411         if ( !p || !*p || *p != '=' )
01412         {
01413                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01414                 return 0;
01415         }
01416 
01417         ++p;    // skip '='
01418         p = SkipWhiteSpace( p, encoding );
01419         if ( !p || !*p )
01420         {
01421                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01422                 return 0;
01423         }
01424         
01425         const char* end;
01426         const char SINGLE_QUOTE = '\'';
01427         const char DOUBLE_QUOTE = '\"';
01428 
01429         if ( *p == SINGLE_QUOTE )
01430         {
01431                 ++p;
01432                 end = "\'";             // single quote in string
01433                 p = ReadText( p, &value, false, end, false, encoding );
01434         }
01435         else if ( *p == DOUBLE_QUOTE )
01436         {
01437                 ++p;
01438                 end = "\"";             // double quote in string
01439                 p = ReadText( p, &value, false, end, false, encoding );
01440         }
01441         else
01442         {
01443                 // All attribute values should be in single or double quotes.
01444                 // But this is such a common error that the parser will try
01445                 // its best, even without them.
01446                 value = "";
01447                 while (    p && *p                                                                                      // existence
01448                                 && !IsWhiteSpace( *p )                                                          // whitespace
01449                                 && *p != '/' && *p != '>' )                                                     // tag end
01450                 {
01451                         if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
01452                                 // [ 1451649 ] Attribute values with trailing quotes not handled correctly
01453                                 // We did not have an opening quote but seem to have a 
01454                                 // closing one. Give up and throw an error.
01455                                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01456                                 return 0;
01457                         }
01458                         value += *p;
01459                         ++p;
01460                 }
01461         }
01462         return p;
01463 }
01464 
#ifdef TIXML_USE_STL
// Pulls the raw characters of a text node out of 'in' and appends them
// to 'tag'.
//
//  - Ordinary text: reading stops in front of the next '<', which is
//    left in the stream (it is only peeked, never consumed).
//  - CDATA text:    reading stops right after the '>' that completes
//    a "]]>" sequence; that '>' has been consumed and appended.
//
// A peeked value <= 0 aborts the read and records
// TIXML_ERROR_EMBEDDED_NULL on the owning document, if there is one.
// NOTE(review): peek() presumably also yields a negative value at end
// of stream, so EOF would be reported through the same error - confirm.
void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
{
        for ( ; in->good(); )
        {
                const int next = in->peek();

                // Start of the following tag: not ours to consume.
                if ( next == '<' && !cdata )
                        return;

                if ( next <= 0 )
                {
                        if ( TiXmlDocument* owner = GetDocument() )
                                owner->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
                        return;
                }

                (*tag) += static_cast<char>( next );
                in->get();      // actually consume the character we peeked at

                // In CDATA mode a '>' preceded by "]]" closes the section.
                if ( cdata && next == '>' )
                {
                        const size_t len = tag->size();
                        if ( len >= 3 && (*tag)[len-2] == ']' && (*tag)[len-3] == ']' )
                                return;
                }
        }
}
#endif
01496 
01497 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01498 {
01499         value = "";
01500         TiXmlDocument* document = GetDocument();
01501 
01502         if ( data )
01503         {
01504                 data->Stamp( p, encoding );
01505                 location = data->Cursor();
01506         }
01507 
01508         const char* const startTag = "<![CDATA[";
01509         const char* const endTag   = "]]>";
01510 
01511         if ( cdata || StringEqual( p, startTag, false, encoding ) )
01512         {
01513                 cdata = true;
01514 
01515                 if ( !StringEqual( p, startTag, false, encoding ) )
01516                 {
01517                         if ( document )
01518                                 document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
01519                         return 0;
01520                 }
01521                 p += strlen( startTag );
01522 
01523                 // Keep all the white space, ignore the encoding, etc.
01524                 while (    p && *p
01525                                 && !StringEqual( p, endTag, false, encoding )
01526                           )
01527                 {
01528                         value += *p;
01529                         ++p;
01530                 }
01531 
01532                 TIXML_STRING dummy; 
01533                 p = ReadText( p, &dummy, false, endTag, false, encoding );
01534                 return p;
01535         }
01536         else
01537         {
01538                 bool ignoreWhite = true;
01539 
01540                 const char* end = "<";
01541                 p = ReadText( p, &value, ignoreWhite, end, false, encoding );
01542                 if ( p && *p )
01543                         return p-1;     // don't truncate the '<'
01544                 return 0;
01545         }
01546 }
01547 
#ifdef TIXML_USE_STL
// Reads a declaration from 'in', appending each character to 'tag',
// up to and including the first '>'.
//
// A character value <= 0 aborts the read (nothing is appended for it)
// and records TIXML_ERROR_EMBEDDED_NULL on the owning document, if any.
// The function also simply returns once the stream is no longer good().
void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
{
        int ch = 0;

        // Stop as soon as the closing '>' has been stored.
        while ( ch != '>' && in->good() )
        {
                ch = in->get();
                if ( ch <= 0 )
                {
                        if ( TiXmlDocument* owner = GetDocument() )
                                owner->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
                        return;
                }
                (*tag) += static_cast<char>( ch );
        }
}
#endif
01571 
01572 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
01573 {
01574         p = SkipWhiteSpace( p, _encoding );
01575         // Find the beginning, find the end, and look for
01576         // the stuff in-between.
01577         TiXmlDocument* document = GetDocument();
01578         if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
01579         {
01580                 if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
01581                 return 0;
01582         }
01583         if ( data )
01584         {
01585                 data->Stamp( p, _encoding );
01586                 location = data->Cursor();
01587         }
01588         p += 5;
01589 
01590         version = "";
01591         encoding = "";
01592         standalone = "";
01593 
01594         while ( p && *p )
01595         {
01596                 if ( *p == '>' )
01597                 {
01598                         ++p;
01599                         return p;
01600                 }
01601 
01602                 p = SkipWhiteSpace( p, _encoding );
01603                 if ( StringEqual( p, "version", true, _encoding ) )
01604                 {
01605                         TiXmlAttribute attrib;
01606                         p = attrib.Parse( p, data, _encoding );         
01607                         version = attrib.Value();
01608                 }
01609                 else if ( StringEqual( p, "encoding", true, _encoding ) )
01610                 {
01611                         TiXmlAttribute attrib;
01612                         p = attrib.Parse( p, data, _encoding );         
01613                         encoding = attrib.Value();
01614                 }
01615                 else if ( StringEqual( p, "standalone", true, _encoding ) )
01616                 {
01617                         TiXmlAttribute attrib;
01618                         p = attrib.Parse( p, data, _encoding );         
01619                         standalone = attrib.Value();
01620                 }
01621                 else
01622                 {
01623                         // Read over whatever it is.
01624                         while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
01625                                 ++p;
01626                 }
01627         }
01628         return 0;
01629 }
01630 
01631 bool TiXmlText::Blank() const
01632 {
01633         for ( unsigned i=0; i<value.length(); i++ )
01634                 if ( !IsWhiteSpace( value[i] ) )
01635                         return false;
01636         return true;
01637 }
01638 


sick_scan
Author(s): Michael Lehning, Jochen Sprickerhof, Martin Günther
autogenerated on Tue Jul 9 2019 05:05:35