rtt: tinyxmlparser.cpp Source File

Go to the documentation of this file.
00001 /***************************************************************************
00002   tag: Peter Soetens  do nov 2 13:06:01 CET 2006  tinyxmlparser.cpp
00003 
00004                         tinyxmlparser.cpp -  description
00005                            -------------------
00006     begin                : do november 02 2006
00007     copyright            : (C) 2006 Peter Soetens
00008     email                : peter.soetens@gmail.com
00009 
00010  ***************************************************************************
00011  *   This library is free software; you can redistribute it and/or         *
00012  *   modify it under the terms of the GNU General Public                   *
00013  *   License as published by the Free Software Foundation;                 *
00014  *   version 2 of the License.                                             *
00015  *                                                                         *
00016  *   As a special exception, you may use this file as part of a free       *
00017  *   software library without restriction.  Specifically, if other files   *
00018  *   instantiate templates or use macros or inline functions from this     *
00019  *   file, or you compile this file and link it with other files to        *
00020  *   produce an executable, this file does not by itself cause the         *
00021  *   resulting executable to be covered by the GNU General Public          *
00022  *   License.  This exception does not however invalidate any other        *
00023  *   reasons why the executable file might be covered by the GNU General   *
00024  *   Public License.                                                       *
00025  *                                                                         *
00026  *   This library is distributed in the hope that it will be useful,       *
00027  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
00028  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     *
00029  *   Lesser General Public License for more details.                       *
00030  *                                                                         *
00031  *   You should have received a copy of the GNU General Public             *
00032  *   License along with this library; if not, write to the Free Software   *
00033  *   Foundation, Inc., 59 Temple Place,                                    *
00034  *   Suite 330, Boston, MA  02111-1307  USA                                *
00035  *                                                                         *
00036  ***************************************************************************/
00037 
00038 
00039 /*
00040 www.sourceforge.net/projects/tinyxml
00041 Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
00042 
00043 This software is provided 'as-is', without any express or implied
00044 warranty. In no event will the authors be held liable for any
00045 damages arising from the use of this software.
00046 
00047 Permission is granted to anyone to use this software for any
00048 purpose, including commercial applications, and to alter it and
00049 redistribute it freely, subject to the following restrictions:
00050 
00051 1. The origin of this software must not be misrepresented; you must
00052 not claim that you wrote the original software. If you use this
00053 software in a product, an acknowledgment in the product documentation
00054 would be appreciated but is not required.
00055 
00056 2. Altered source versions must be plainly marked as such, and
00057 must not be misrepresented as being the original software.
00058 
00059 3. This notice may not be removed or altered from any source
00060 distribution.
00061 */
00062 
00063 #include "tinyxml.h"
00064 #include <ctype.h>
00065 #include <stddef.h>
00066 
00067 //#define DEBUG_PARSER
00068 #if defined( DEBUG_PARSER )
00069 #       if defined( DEBUG ) && defined( _MSC_VER )
00070 #               include <windows.h>
00071 #               define TIXML_LOG OutputDebugString
00072 #       else
00073 #               define TIXML_LOG printf
00074 #       endif
00075 #endif
00076 
00077 namespace RTT { namespace marsh {
00078 
00079 // Note tha "PutString" hardcodes the same list. This
00080 // is less flexible than it appears. Changing the entries
00081 // or order will break putstring.
00082 TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
00083 {
00084         { "&amp;",  5, '&' },
00085         { "&lt;",   4, '<' },
00086         { "&gt;",   4, '>' },
00087         { "&quot;", 6, '\"' },
00088         { "&apos;", 6, '\'' }
00089 };
00090 
00091 // Bunch of unicode info at:
00092 //              http://www.unicode.org/faq/utf_bom.html
00093 // Including the basic of this table, which determines the #bytes in the
00094 // sequence from the lead byte. 1 placed for invalid sequences --
00095 // although the result will be junk, pass it through as much as possible.
00096 // Beware of the non-characters in UTF-8:
00097 //                              ef bb bf (Microsoft "lead bytes")
00098 //                              ef bf be
00099 //                              ef bf bf
00100 
// The three bytes (ef bb bf) of the Microsoft UTF-8 "lead bytes" / byte
// order mark. TIXML_UTF_LEAD_0 is also the first byte of the ef bf be and
// ef bf bf non-characters.
const unsigned char TIXML_UTF_LEAD_0 = 0xEFu;
const unsigned char TIXML_UTF_LEAD_1 = 0xBBu;
const unsigned char TIXML_UTF_LEAD_2 = 0xBFu;
00104 
00105 const int TiXmlBase::utf8ByteTable[256] =
00106 {
00107         //      0       1       2       3       4       5       6       7       8       9       a       b       c       d       e       f
00108                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x00
00109                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x10
00110                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x20
00111                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x30
00112                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x40
00113                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x50
00114                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x60
00115                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x70 End of ASCII range
00116                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x80 0x80 to 0xc1 invalid
00117                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x90
00118                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0xa0
00119                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0xb0
00120                 1,      1,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      // 0xc0 0xc2 to 0xdf 2 byte
00121                 2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      // 0xd0
00122                 3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      // 0xe0 0xe0 to 0xef 3 byte
00123                 4,      4,      4,      4,      4,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1       // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
00124 };
00125 
00126 
00127 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
00128 {
00129         const unsigned long BYTE_MASK = 0xBF;
00130         const unsigned long BYTE_MARK = 0x80;
00131         const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00132 
00133         if (input < 0x80)
00134                 *length = 1;
00135         else if ( input < 0x800 )
00136                 *length = 2;
00137         else if ( input < 0x10000 )
00138                 *length = 3;
00139         else if ( input < 0x200000 )
00140                 *length = 4;
00141         else
00142                 { *length = 0; return; }        // This code won't covert this correctly anyway.
00143 
00144         output += *length;
00145 
00146         // Scary scary fall throughs.
00147         switch (*length)
00148         {
00149                 case 4:
00150                         --output;
00151                         *output = (char)((input | BYTE_MARK) & BYTE_MASK);
00152                         input >>= 6;
00153                 case 3:
00154                         --output;
00155                         *output = (char)((input | BYTE_MARK) & BYTE_MASK);
00156                         input >>= 6;
00157                 case 2:
00158                         --output;
00159                         *output = (char)((input | BYTE_MARK) & BYTE_MASK);
00160                         input >>= 6;
00161                 case 1:
00162                         --output;
00163                         *output = (char)(input | FIRST_BYTE_MARK[*length]);
00164         }
00165 }
00166 
00167 
00168 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
00169 {
00170         // This will only work for low-ascii, everything else is assumed to be a valid
00171         // letter. I'm not sure this is the best approach, but it is quite tricky trying
00172         // to figure out alhabetical vs. not across encoding. So take a very
00173         // conservative approach.
00174 
00175 //      if ( encoding == TIXML_ENCODING_UTF8 )
00176 //      {
00177                 if ( anyByte < 127 )
00178                         return isalpha( anyByte );
00179                 else
00180                         return 1;       // What else to do? The unicode set is huge...get the english ones right.
00181 //      }
00182 //      else
00183 //      {
00184 //              return isalpha( anyByte );
00185 //      }
00186 }
00187 
00188 
00189 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
00190 {
00191         // This will only work for low-ascii, everything else is assumed to be a valid
00192         // letter. I'm not sure this is the best approach, but it is quite tricky trying
00193         // to figure out alhabetical vs. not across encoding. So take a very
00194         // conservative approach.
00195 
00196 //      if ( encoding == TIXML_ENCODING_UTF8 )
00197 //      {
00198                 if ( anyByte < 127 )
00199                         return isalnum( anyByte );
00200                 else
00201                         return 1;       // What else to do? The unicode set is huge...get the english ones right.
00202 //      }
00203 //      else
00204 //      {
00205 //              return isalnum( anyByte );
00206 //      }
00207 }
00208 
00209 
00210 class TiXmlParsingData
00211 {
00212         friend class TiXmlDocument;
00213   public:
00214         void Stamp( const char* now, TiXmlEncoding encoding );
00215 
00216         const TiXmlCursor& Cursor()     { return cursor; }
00217 
00218   private:
00219         // Only used by the document!
00220         TiXmlParsingData( const char* start, int _tabsize, int row, int col )
00221         {
00222                 assert( start );
00223                 stamp = start;
00224                 tabsize = _tabsize;
00225                 cursor.row = row;
00226                 cursor.col = col;
00227         }
00228 
00229         TiXmlCursor             cursor;
00230         const char*             stamp;
00231         int                             tabsize;
00232 };
00233 
00234 
00235 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
00236 {
00237         assert( now );
00238 
00239         // Do nothing if the tabsize is 0.
00240         if ( tabsize < 1 )
00241         {
00242                 return;
00243         }
00244 
00245         // Get the current row, column.
00246         int row = cursor.row;
00247         int col = cursor.col;
00248         const char* p = stamp;
00249         assert( p );
00250 
00251         while ( p < now )
00252         {
00253                 // Treat p as unsigned, so we have a happy compiler.
00254                 const unsigned char* pU = (const unsigned char*)p;
00255 
00256                 // Code contributed by Fletcher Dunn: (modified by lee)
00257                 switch (*pU) {
00258                         case 0:
00259                                 // We *should* never get here, but in case we do, don't
00260                                 // advance past the terminating null character, ever
00261                                 return;
00262 
00263                         case '\r':
00264                                 // bump down to the next line
00265                                 ++row;
00266                                 col = 0;
00267                                 // Eat the character
00268                                 ++p;
00269 
00270                                 // Check for \r\n sequence, and treat this as a single character
00271                                 if (*p == '\n') {
00272                                         ++p;
00273                                 }
00274                                 break;
00275 
00276                         case '\n':
00277                                 // bump down to the next line
00278                                 ++row;
00279                                 col = 0;
00280 
00281                                 // Eat the character
00282                                 ++p;
00283 
00284                                 // Check for \n\r sequence, and treat this as a single
00285                                 // character.  (Yes, this bizarre thing does occur still
00286                                 // on some arcane platforms...)
00287                                 if (*p == '\r') {
00288                                         ++p;
00289                                 }
00290                                 break;
00291 
00292                         case '\t':
00293                                 // Eat the character
00294                                 ++p;
00295 
00296                                 // Skip to next tab stop
00297                                 col = (col / tabsize + 1) * tabsize;
00298                                 break;
00299 
00300                         case TIXML_UTF_LEAD_0:
00301                                 if ( encoding == TIXML_ENCODING_UTF8 )
00302                                 {
00303                                         if ( *(p+1) && *(p+2) )
00304                                         {
00305                                                 // In these cases, don't advance the column. These are
00306                                                 // 0-width spaces.
00307                                                 if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
00308                                                         p += 3;
00309                                                 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
00310                                                         p += 3;
00311                                                 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
00312                                                         p += 3;
00313                                                 else
00314                                                         { p +=3; ++col; }       // A normal character.
00315                                         }
00316                                 }
00317                                 else
00318                                 {
00319                                         ++p;
00320                                         ++col;
00321                                 }
00322                                 break;
00323 
00324                         default:
00325                                 if ( encoding == TIXML_ENCODING_UTF8 )
00326                                 {
00327                                         // Eat the 1 to 4 byte utf8 character.
00328                                         int step = TiXmlBase::utf8ByteTable[*((unsigned char*)p)];
00329                                         if ( step == 0 )
00330                                                 step = 1;               // Error case from bad encoding, but handle gracefully.
00331                                         p += step;
00332 
00333                                         // Just advance one column, of course.
00334                                         ++col;
00335                                 }
00336                                 else
00337                                 {
00338                                         ++p;
00339                                         ++col;
00340                                 }
00341                                 break;
00342                 }
00343         }
00344         cursor.row = row;
00345         cursor.col = col;
00346         assert( cursor.row >= -1 );
00347         assert( cursor.col >= -1 );
00348         stamp = p;
00349         assert( stamp );
00350 }
00351 
00352 
00353 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
00354 {
00355         if ( !p || !*p )
00356         {
00357                 return 0;
00358         }
00359         if ( encoding == TIXML_ENCODING_UTF8 )
00360         {
00361                 while ( *p )
00362                 {
00363                         const unsigned char* pU = (const unsigned char*)p;
00364 
00365                         // Skip the stupid Microsoft UTF-8 Byte order marks
00366                         if (    *(pU+0)==TIXML_UTF_LEAD_0
00367                                  && *(pU+1)==TIXML_UTF_LEAD_1
00368                                  && *(pU+2)==TIXML_UTF_LEAD_2 )
00369                         {
00370                                 p += 3;
00371                                 continue;
00372                         }
00373                         else if(*(pU+0)==TIXML_UTF_LEAD_0
00374                                  && *(pU+1)==0xbfU
00375                                  && *(pU+2)==0xbeU )
00376                         {
00377                                 p += 3;
00378                                 continue;
00379                         }
00380                         else if(*(pU+0)==TIXML_UTF_LEAD_0
00381                                  && *(pU+1)==0xbfU
00382                                  && *(pU+2)==0xbfU )
00383                         {
00384                                 p += 3;
00385                                 continue;
00386                         }
00387 
00388                         if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )            // Still using old rules for white space.
00389                                 ++p;
00390                         else
00391                                 break;
00392                 }
00393         }
00394         else
00395         {
00396                 while ( *p && (IsWhiteSpace( *p ) || *p == '\n' || *p =='\r') )
00397                         ++p;
00398         }
00399 
00400         return p;
00401 }
00402 
00403 #ifdef TIXML_USE_STL
00404 /*static*/ bool TiXmlBase::StreamWhiteSpace( TIXML_ISTREAM * in, TIXML_STRING * tag )
00405 {
00406         for( ;; )
00407         {
00408                 if ( !in->good() ) return false;
00409 
00410                 int c = in->peek();
00411                 // At this scope, we can't get to a document. So fail silently.
00412                 if ( !IsWhiteSpace( c ) || c <= 0 )
00413                         return true;
00414 
00415                 *tag += (char) in->get();
00416         }
00417 }
00418 
00419 /*static*/ bool TiXmlBase::StreamTo( TIXML_ISTREAM * in, int character, TIXML_STRING * tag )
00420 {
00421         //assert( character > 0 && character < 128 );   // else it won't work in utf-8
00422         while ( in->good() )
00423         {
00424                 int c = in->peek();
00425                 if ( c == character )
00426                         return true;
00427                 if ( c <= 0 )           // Silent failure: can't get document at this scope
00428                         return false;
00429 
00430                 in->get();
00431                 *tag += (char) c;
00432         }
00433         return false;
00434 }
00435 #endif
00436 
00437 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
00438 {
00439         *name = "";
00440         assert( p );
00441 
00442         // Names start with letters or underscores.
00443         // Of course, in unicode, tinyxml has no idea what a letter *is*. The
00444         // algorithm is generous.
00445         //
00446         // After that, they can be letters, underscores, numbers,
00447         // hyphens, or colons. (Colons are valid ony for namespaces,
00448         // but tinyxml can't tell namespaces from names.)
00449         if (    p && *p
00450                  && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
00451         {
00452                 while(          p && *p
00453                                 &&      (               IsAlphaNum( (unsigned char ) *p, encoding )
00454                                                  || *p == '_'
00455                                                  || *p == '-'
00456                                                  || *p == '.'
00457                                                  || *p == ':' ) )
00458                 {
00459                         (*name) += *p;
00460                         ++p;
00461                 }
00462                 return p;
00463         }
00464         return 0;
00465 }
00466 
00467 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
00468 {
00469         // Presume an entity, and pull it out.
00470     TIXML_STRING ent;
00471         int i;
00472         *length = 0;
00473 
00474         if ( *(p+1) && *(p+1) == '#' && *(p+2) )
00475         {
00476                 unsigned long ucs = 0;
00477                 ptrdiff_t delta = 0;
00478                 unsigned mult = 1;
00479 
00480                 if ( *(p+2) == 'x' )
00481                 {
00482                         // Hexadecimal.
00483                         if ( !*(p+3) ) return 0;
00484 
00485                         const char* q = p+3;
00486                         q = strchr( q, ';' );
00487 
00488                         if ( !q || !*q ) return 0;
00489 
00490                         delta = q-p;
00491                         --q;
00492 
00493                         while ( *q != 'x' )
00494                         {
00495                                 if ( *q >= '0' && *q <= '9' )
00496                                         ucs += mult * (*q - '0');
00497                                 else if ( *q >= 'a' && *q <= 'f' )
00498                                         ucs += mult * (*q - 'a' + 10);
00499                                 else if ( *q >= 'A' && *q <= 'F' )
00500                                         ucs += mult * (*q - 'A' + 10 );
00501                                 else
00502                                         return 0;
00503                                 mult *= 16;
00504                                 --q;
00505                         }
00506                 }
00507                 else
00508                 {
00509                         // Decimal.
00510                         if ( !*(p+2) ) return 0;
00511 
00512                         const char* q = p+2;
00513                         q = strchr( q, ';' );
00514 
00515                         if ( !q || !*q ) return 0;
00516 
00517                         delta = q-p;
00518                         --q;
00519 
00520                         while ( *q != '#' )
00521                         {
00522                                 if ( *q >= '0' && *q <= '9' )
00523                                         ucs += mult * (*q - '0');
00524                                 else
00525                                         return 0;
00526                                 mult *= 10;
00527                                 --q;
00528                         }
00529                 }
00530                 if ( encoding == TIXML_ENCODING_UTF8 )
00531                 {
00532                         // convert the UCS to UTF-8
00533                         ConvertUTF32ToUTF8( ucs, value, length );
00534                 }
00535                 else
00536                 {
00537                         *value = (char)ucs;
00538                         *length = 1;
00539                 }
00540                 return p + delta + 1;
00541         }
00542 
00543         // Now try to match it.
00544         for( i=0; i<NUM_ENTITY; ++i )
00545         {
00546                 if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
00547                 {
00548                         assert( strlen( entity[i].str ) == entity[i].strLength );
00549                         *value = entity[i].chr;
00550                         *length = 1;
00551                         return ( p + entity[i].strLength );
00552                 }
00553         }
00554 
00555         // So it wasn't an entity, its unrecognized, or something like that.
00556         *value = *p;    // Don't put back the last one, since we return it!
00557         return p+1;
00558 }
00559 
00560 
00561 bool TiXmlBase::StringEqual( const char* p,
00562                                                          const char* tag,
00563                                                          bool ignoreCase,
00564                                                          TiXmlEncoding encoding )
00565 {
00566         assert( p );
00567         assert( tag );
00568         if ( !p || !*p )
00569         {
00570                 assert( 0 );
00571                 return false;
00572         }
00573 
00574         const char* q = p;
00575 
00576         if ( ignoreCase )
00577         {
00578                 while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
00579                 {
00580                         ++q;
00581                         ++tag;
00582                 }
00583 
00584                 if ( *tag == 0 )
00585                         return true;
00586         }
00587         else
00588         {
00589                 while ( *q && *tag && *q == *tag )
00590                 {
00591                         ++q;
00592                         ++tag;
00593                 }
00594 
00595                 if ( *tag == 0 )                // Have we found the end of the tag, and everything equal?
00596                         return true;
00597         }
00598         return false;
00599 }
00600 
00601 const char* TiXmlBase::ReadText(        const char* p,
00602                                                                         TIXML_STRING * text,
00603                                                                         bool trimWhiteSpace,
00604                                                                         const char* endTag,
00605                                                                         bool caseInsensitive,
00606                                                                         TiXmlEncoding encoding )
00607 {
00608     *text = "";
00609         if (    !trimWhiteSpace                 // certain tags always keep whitespace
00610                  || !condenseWhiteSpace )       // if true, whitespace is always kept
00611         {
00612                 // Keep all the white space.
00613                 while (    p && *p
00614                                 && !StringEqual( p, endTag, caseInsensitive, encoding )
00615                           )
00616                 {
00617                         int len;
00618                         char cArr[4] = { 0, 0, 0, 0 };
00619                         p = GetChar( p, cArr, &len, encoding );
00620                         text->append( cArr, len );
00621                 }
00622         }
00623         else
00624         {
00625                 bool whitespace = false;
00626 
00627                 // Remove leading white space:
00628                 p = SkipWhiteSpace( p, encoding );
00629                 while (    p && *p
00630                                 && !StringEqual( p, endTag, caseInsensitive, encoding ) )
00631                 {
00632                         if ( *p == '\r' || *p == '\n' )
00633                         {
00634                                 whitespace = true;
00635                                 ++p;
00636                         }
00637                         else if ( IsWhiteSpace( *p ) )
00638                         {
00639                                 whitespace = true;
00640                                 ++p;
00641                         }
00642                         else
00643                         {
00644                                 // If we've found whitespace, add it before the
00645                                 // new character. Any whitespace just becomes a space.
00646                                 if ( whitespace )
00647                                 {
00648                                         (*text) += ' ';
00649                                         whitespace = false;
00650                                 }
00651                                 int len;
00652                                 char cArr[4] = { 0, 0, 0, 0 };
00653                                 p = GetChar( p, cArr, &len, encoding );
00654                                 if ( len == 1 )
00655                                         (*text) += cArr[0];     // more efficient
00656                                 else
00657                                         text->append( cArr, len );
00658                         }
00659                 }
00660         }
00661         return p + strlen( endTag );
00662 }
00663 
00664 #ifdef TIXML_USE_STL
00665 
00666 void TiXmlDocument::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
00667 {
00668         // The basic issue with a document is that we don't know what we're
00669         // streaming. Read something presumed to be a tag (and hope), then
00670         // identify it, and call the appropriate stream method on the tag.
00671         //
00672         // This "pre-streaming" will never read the closing ">" so the
00673         // sub-tag can orient itself.
00674 
00675         if ( !StreamTo( in, '<', tag ) )
00676         {
00677                 SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00678                 return;
00679         }
00680 
00681         while ( in->good() )
00682         {
00683                 int tagIndex = (int) tag->length();
00684                 while ( in->good() && in->peek() != '>' )
00685                 {
00686                         int c = in->get();
00687                         if ( c <= 0 )
00688                         {
00689                                 SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
00690                                 break;
00691                         }
00692                         (*tag) += (char) c;
00693                 }
00694 
00695                 if ( in->good() )
00696                 {
00697                         // We now have something we presume to be a node of
00698                         // some sort. Identify it, and call the node to
00699                         // continue streaming.
00700                         TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
00701 
00702                         if ( node )
00703                         {
00704                                 node->StreamIn( in, tag );
00705                                 bool isElement = node->ToElement() != 0;
00706                                 delete node;
00707                                 node = 0;
00708 
00709                                 // If this is the root element, we're done. Parsing will be
00710                                 // done by the >> operator.
00711                                 if ( isElement )
00712                                 {
00713                                         return;
00714                                 }
00715                         }
00716                         else
00717                         {
00718                                 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
00719                                 return;
00720                         }
00721                 }
00722         }
00723         // We should have returned sooner.
00724         SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
00725 }
00726 
00727 #endif
00728 
00729 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
00730 {
00731         ClearError();
00732 
00733         // Parse away, at the document level. Since a document
00734         // contains nothing but other tags, most of what happens
00735         // here is skipping white space.
00736         if ( !p || !*p )
00737         {
00738                 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00739                 return 0;
00740         }
00741 
00742         // Note that, for a document, this needs to come
00743         // before the while space skip, so that parsing
00744         // starts from the pointer we are given.
00745         location.Clear();
00746         if ( prevData )
00747         {
00748                 location.row = prevData->cursor.row;
00749                 location.col = prevData->cursor.col;
00750         }
00751         else
00752         {
00753                 location.row = 0;
00754                 location.col = 0;
00755         }
00756         TiXmlParsingData data( p, TabSize(), location.row, location.col );
00757         location = data.Cursor();
00758 
00759         if ( encoding == TIXML_ENCODING_UNKNOWN )
00760         {
00761                 // Check for the Microsoft UTF-8 lead bytes.
00762                 const unsigned char* pU = (const unsigned char*)p;
00763                 if (    *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
00764                          && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
00765                          && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
00766                 {
00767                         encoding = TIXML_ENCODING_UTF8;
00768                         useMicrosoftBOM = true;
00769                 }
00770         }
00771 
00772     p = SkipWhiteSpace( p, encoding );
00773         if ( !p )
00774         {
00775                 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00776                 return 0;
00777         }
00778 
00779         while ( p && *p )
00780         {
00781                 TiXmlNode* node = Identify( p, encoding );
00782                 if ( node )
00783                 {
00784                         p = node->Parse( p, &data, encoding );
00785                         LinkEndChild( node );
00786                 }
00787                 else
00788                 {
00789                         break;
00790                 }
00791 
00792                 // Did we get encoding info?
00793                 if (    encoding == TIXML_ENCODING_UNKNOWN
00794                          && node->ToDeclaration() )
00795                 {
00796                         TiXmlDeclaration* dec = node->ToDeclaration();
00797                         const char* enc = dec->Encoding();
00798                         assert( enc );
00799 
00800                         if ( *enc == 0 )
00801                                 encoding = TIXML_ENCODING_UTF8;
00802                         else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
00803                                 encoding = TIXML_ENCODING_UTF8;
00804                         else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
00805                                 encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
00806                         else
00807                                 encoding = TIXML_ENCODING_LEGACY;
00808                 }
00809 
00810                 p = SkipWhiteSpace( p, encoding );
00811         }
00812 
00813         // Was this empty?
00814         if ( !firstChild ) {
00815                 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
00816                 return 0;
00817         }
00818 
00819         // All is well.
00820         return p;
00821 }
00822 
00823 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
00824 {
00825         // The first error in a chain is more accurate - don't set again!
00826         if ( error )
00827                 return;
00828 
00829         assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
00830         error   = true;
00831         errorId = err;
00832         errorDesc = errorString[ errorId ];
00833 
00834         errorLocation.Clear();
00835         if ( pError && data )
00836         {
00837                 data->Stamp( pError, encoding );
00838                 errorLocation = data->Cursor();
00839         }
00840 }
00841 
00842 
00843 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
00844 {
00845         TiXmlNode* returnNode = 0;
00846 
00847         p = SkipWhiteSpace( p, encoding );
00848         if( !p || !*p || *p != '<' )
00849         {
00850                 return 0;
00851         }
00852 
00853         TiXmlDocument* doc = GetDocument();
00854         p = SkipWhiteSpace( p, encoding );
00855 
00856         if ( !p || !*p )
00857         {
00858                 return 0;
00859         }
00860 
00861         // What is this thing?
00862         // - Elements start with a letter or underscore, but xml is reserved.
00863         // - Comments: <!--
00864         // - Decleration: <?xml
00865         // - Everthing else is unknown to tinyxml.
00866         //
00867 
00868         const char* xmlHeader = { "<?xml" };
00869         const char* commentHeader = { "<!--" };
00870         const char* dtdHeader = { "<!" };
00871         const char* cdataHeader = { "<![CDATA[" };
00872 
00873         if ( StringEqual( p, xmlHeader, true, encoding ) )
00874         {
00875                 #ifdef DEBUG_PARSER
00876                         TIXML_LOG( "XML parsing Declaration\n" );
00877                 #endif
00878                 returnNode = new TiXmlDeclaration();
00879         }
00880         else if ( StringEqual( p, commentHeader, false, encoding ) )
00881         {
00882                 #ifdef DEBUG_PARSER
00883                         TIXML_LOG( "XML parsing Comment\n" );
00884                 #endif
00885                 returnNode = new TiXmlComment();
00886         }
00887         else if ( StringEqual( p, cdataHeader, false, encoding ) )
00888         {
00889                 #ifdef DEBUG_PARSER
00890                         TIXML_LOG( "XML parsing CDATA\n" );
00891                 #endif
00892                 TiXmlText* text = new TiXmlText( "" );
00893                 text->SetCDATA( true );
00894                 returnNode = text;
00895         }
00896         else if ( StringEqual( p, dtdHeader, false, encoding ) )
00897         {
00898                 #ifdef DEBUG_PARSER
00899                         TIXML_LOG( "XML parsing Unknown(1)\n" );
00900                 #endif
00901                 returnNode = new TiXmlUnknown();
00902         }
00903         else if (    IsAlpha( *(p+1), encoding )
00904                           || *(p+1) == '_' )
00905         {
00906                 #ifdef DEBUG_PARSER
00907                         TIXML_LOG( "XML parsing Element\n" );
00908                 #endif
00909                 returnNode = new TiXmlElement( "" );
00910         }
00911         else
00912         {
00913                 #ifdef DEBUG_PARSER
00914                         TIXML_LOG( "XML parsing Unknown(2)\n" );
00915                 #endif
00916                 returnNode = new TiXmlUnknown();
00917         }
00918 
00919         if ( returnNode )
00920         {
00921                 // Set the parent, so it can report errors
00922                 returnNode->parent = this;
00923         }
00924         else
00925         {
00926                 if ( doc )
00927                         doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
00928         }
00929         return returnNode;
00930 }
00931 
00932 #ifdef TIXML_USE_STL
00933 
00934 void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag)
00935 {
00936         // We're called with some amount of pre-parsing. That is, some of "this"
00937         // element is in "tag". Go ahead and stream to the closing ">"
00938         while( in->good() )
00939         {
00940                 int c = in->get();
00941                 if ( c <= 0 )
00942                 {
00943                         TiXmlDocument* document = GetDocument();
00944                         if ( document )
00945                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
00946                         return;
00947                 }
00948                 (*tag) += (char) c ;
00949 
00950                 if ( c == '>' )
00951                         break;
00952         }
00953 
00954         if ( tag->length() < 3 ) return;
00955 
00956         // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
00957         // If not, identify and stream.
00958 
00959         if (    tag->at( tag->length() - 1 ) == '>'
00960                  && tag->at( tag->length() - 2 ) == '/' )
00961         {
00962                 // All good!
00963                 return;
00964         }
00965         else if ( tag->at( tag->length() - 1 ) == '>' )
00966         {
00967                 // There is more. Could be:
00968                 //              text
00969                 //              closing tag
00970                 //              another node.
00971                 for ( ;; )
00972                 {
00973                         StreamWhiteSpace( in, tag );
00974 
00975                         // Do we have text?
00976                         if ( in->good() && in->peek() != '<' )
00977                         {
00978                                 // Yep, text.
00979                                 TiXmlText text( "" );
00980                                 text.StreamIn( in, tag );
00981 
00982                                 // What follows text is a closing tag or another node.
00983                                 // Go around again and figure it out.
00984                                 continue;
00985                         }
00986 
00987                         // We now have either a closing tag...or another node.
00988                         // We should be at a "<", regardless.
00989                         if ( !in->good() ) return;
00990                         assert( in->peek() == '<' );
00991                         int tagIndex = (int) tag->length();
00992 
00993                         bool closingTag = false;
00994                         bool firstCharFound = false;
00995 
00996                         for( ;; )
00997                         {
00998                                 if ( !in->good() )
00999                                         return;
01000 
01001                                 int c = in->peek();
01002                                 if ( c <= 0 )
01003                                 {
01004                                         TiXmlDocument* document = GetDocument();
01005                                         if ( document )
01006                                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01007                                         return;
01008                                 }
01009 
01010                                 if ( c == '>' )
01011                                         break;
01012 
01013                                 *tag += (char) c;
01014                                 in->get();
01015 
01016                                 if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
01017                                 {
01018                                         firstCharFound = true;
01019                                         if ( c == '/' )
01020                                                 closingTag = true;
01021                                 }
01022                         }
01023                         // If it was a closing tag, then read in the closing '>' to clean up the input stream.
01024                         // If it was not, the streaming will be done by the tag.
01025                         if ( closingTag )
01026                         {
01027                                 if ( !in->good() )
01028                                         return;
01029 
01030                                 int c = in->get();
01031                                 if ( c <= 0 )
01032                                 {
01033                                         TiXmlDocument* document = GetDocument();
01034                                         if ( document )
01035                                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01036                                         return;
01037                                 }
01038                                 assert( c == '>' );
01039                                 *tag += (char) c;
01040 
01041                                 // We are done, once we've found our closing tag.
01042                                 return;
01043                         }
01044                         else
01045                         {
01046                                 // If not a closing tag, id it, and stream.
01047                                 const char* tagloc = tag->c_str() + tagIndex;
01048                                 TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
01049                                 if ( !node )
01050                                         return;
01051                                 node->StreamIn( in, tag );
01052                                 delete node;
01053                                 node = 0;
01054 
01055                                 // No return: go around from the beginning: text, closing tag, or node.
01056                         }
01057                 }
01058         }
01059 }
01060 #endif
01061 
01062 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01063 {
01064         p = SkipWhiteSpace( p, encoding );
01065         TiXmlDocument* document = GetDocument();
01066 
01067         if ( !p || !*p )
01068         {
01069                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
01070                 return 0;
01071         }
01072 
01073         if ( data )
01074         {
01075                 data->Stamp( p, encoding );
01076                 location = data->Cursor();
01077         }
01078 
01079         if ( *p != '<' )
01080         {
01081                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
01082                 return 0;
01083         }
01084 
01085         p = SkipWhiteSpace( p+1, encoding );
01086 
01087         // Read the name.
01088         const char* pErr = p;
01089 
01090     p = ReadName( p, &value, encoding );
01091         if ( !p || !*p )
01092         {
01093                 if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
01094                 return 0;
01095         }
01096 
01097     TIXML_STRING endTag ("</");
01098         endTag += value;
01099         endTag += ">";
01100 
01101         // Check for and read attributes. Also look for an empty
01102         // tag or an end tag.
01103         while ( p && *p )
01104         {
01105                 pErr = p;
01106                 p = SkipWhiteSpace( p, encoding );
01107                 if ( !p || !*p )
01108                 {
01109                         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
01110                         return 0;
01111                 }
01112                 if ( *p == '/' )
01113                 {
01114                         ++p;
01115                         // Empty tag.
01116                         if ( *p  != '>' )
01117                         {
01118                                 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
01119                                 return 0;
01120                         }
01121                         return (p+1);
01122                 }
01123                 else if ( *p == '>' )
01124                 {
01125                         // Done with attributes (if there were any.)
01126                         // Read the value -- which can include other
01127                         // elements -- read the end tag, and return.
01128                         ++p;
01129                         p = ReadValue( p, data, encoding );             // Note this is an Element method, and will set the error if one happens.
01130                         if ( !p || !*p )
01131                                 return 0;
01132 
01133                         // We should find the end tag now
01134                         if ( StringEqual( p, endTag.c_str(), false, encoding ) )
01135                         {
01136                                 p += endTag.length();
01137                                 return p;
01138                         }
01139                         else
01140                         {
01141                                 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
01142                                 return 0;
01143                         }
01144                 }
01145                 else
01146                 {
01147                         // Try to read an attribute:
01148                         TiXmlAttribute* attrib = new TiXmlAttribute();
01149                         if ( !attrib )
01150                         {
01151                                 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
01152                                 return 0;
01153                         }
01154 
01155                         attrib->SetDocument( document );
01156                         const char* pErr = p;
01157                         p = attrib->Parse( p, data, encoding );
01158 
01159                         if ( !p || !*p )
01160                         {
01161                                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
01162                                 delete attrib;
01163                                 return 0;
01164                         }
01165 
01166                         // Handle the strange case of double attributes:
01167                         TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
01168                         if ( node )
01169                         {
01170                                 node->SetValue( attrib->Value() );
01171                                 delete attrib;
01172                                 return 0;
01173                         }
01174 
01175                         attributeSet.Add( attrib );
01176                 }
01177         }
01178         return p;
01179 }
01180 
01181 
01182 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01183 {
01184         TiXmlDocument* document = GetDocument();
01185 
01186         // Read in text and elements in any order.
01187         const char* pWithWhiteSpace = p;
01188         p = SkipWhiteSpace( p, encoding );
01189 
01190         while ( p && *p )
01191         {
01192                 if ( *p != '<' )
01193                 {
01194                         // Take what we have, make a text element.
01195                         TiXmlText* textNode = new TiXmlText( "" );
01196 
01197                         if ( !textNode )
01198                         {
01199                                 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
01200                                     return 0;
01201                         }
01202 
01203                         if ( TiXmlBase::IsWhiteSpaceCondensed() )
01204                         {
01205                                 p = textNode->Parse( p, data, encoding );
01206                         }
01207                         else
01208                         {
01209                                 // Special case: we want to keep the white space
01210                                 // so that leading spaces aren't removed.
01211                                 p = textNode->Parse( pWithWhiteSpace, data, encoding );
01212                         }
01213 
01214                         if ( !textNode->Blank() )
01215                                 LinkEndChild( textNode );
01216                         else
01217                                 delete textNode;
01218                 }
01219                 else
01220                 {
01221                         // We hit a '<'
01222                         // Have we hit a new element or an end tag? This could also be
01223                         // a TiXmlText in the "CDATA" style.
01224                         if ( StringEqual( p, "</", false, encoding ) )
01225                         {
01226                                 return p;
01227                         }
01228                         else
01229                         {
01230                                 TiXmlNode* node = Identify( p, encoding );
01231                                 if ( node )
01232                                 {
01233                                         p = node->Parse( p, data, encoding );
01234                                         LinkEndChild( node );
01235                                 }
01236                                 else
01237                                 {
01238                                         return 0;
01239                                 }
01240                         }
01241                 }
01242                 pWithWhiteSpace = p;
01243                 p = SkipWhiteSpace( p, encoding );
01244         }
01245 
01246         if ( !p )
01247         {
01248                 if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
01249         }
01250         return p;
01251 }
01252 
01253 
01254 #ifdef TIXML_USE_STL
01255 void TiXmlUnknown::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
01256 {
01257         while ( in->good() )
01258         {
01259                 int c = in->get();
01260                 if ( c <= 0 )
01261                 {
01262                         TiXmlDocument* document = GetDocument();
01263                         if ( document )
01264                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01265                         return;
01266                 }
01267                 (*tag) += (char) c;
01268 
01269                 if ( c == '>' )
01270                 {
01271                         // All is well.
01272                         return;
01273                 }
01274         }
01275 }
01276 #endif
01277 
01278 
01279 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01280 {
01281         TiXmlDocument* document = GetDocument();
01282         p = SkipWhiteSpace( p, encoding );
01283 
01284         if ( data )
01285         {
01286                 data->Stamp( p, encoding );
01287                 location = data->Cursor();
01288         }
01289         if ( !p || !*p || *p != '<' )
01290         {
01291                 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
01292                 return 0;
01293         }
01294         ++p;
01295     value = "";
01296 
01297         while ( p && *p && *p != '>' )
01298         {
01299                 value += *p;
01300                 ++p;
01301         }
01302 
01303         if ( !p )
01304         {
01305                 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
01306         }
01307         if ( *p == '>' )
01308                 return p+1;
01309         return p;
01310 }
01311 
01312 #ifdef TIXML_USE_STL
01313 void TiXmlComment::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
01314 {
01315         while ( in->good() )
01316         {
01317                 int c = in->get();
01318                 if ( c <= 0 )
01319                 {
01320                         TiXmlDocument* document = GetDocument();
01321                         if ( document )
01322                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01323                         return;
01324                 }
01325 
01326                 (*tag) += (char) c;
01327 
01328                 if ( c == '>'
01329                          && tag->at( tag->length() - 2 ) == '-'
01330                          && tag->at( tag->length() - 3 ) == '-' )
01331                 {
01332                         // All is well.
01333                         return;
01334                 }
01335         }
01336 }
01337 #endif
01338 
01339 
01340 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01341 {
01342         TiXmlDocument* document = GetDocument();
01343         value = "";
01344 
01345         p = SkipWhiteSpace( p, encoding );
01346 
01347         if ( data )
01348         {
01349                 data->Stamp( p, encoding );
01350                 location = data->Cursor();
01351         }
01352         const char* startTag = "<!--";
01353         const char* endTag   = "-->";
01354 
01355         if ( !StringEqual( p, startTag, false, encoding ) )
01356         {
01357                 document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
01358                 return 0;
01359         }
01360         p += strlen( startTag );
01361         p = ReadText( p, &value, false, endTag, false, encoding );
01362         return p;
01363 }
01364 
01365 
01366 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01367 {
01368         p = SkipWhiteSpace( p, encoding );
01369         if ( !p || !*p ) return 0;
01370 
01371         int tabsize = 4;
01372         if ( document )
01373                 tabsize = document->TabSize();
01374 
01375         if ( data )
01376         {
01377                 data->Stamp( p, encoding );
01378                 location = data->Cursor();
01379         }
01380         // Read the name, the '=' and the value.
01381         const char* pErr = p;
01382         p = ReadName( p, &name, encoding );
01383         if ( !p || !*p )
01384         {
01385                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
01386                 return 0;
01387         }
01388         p = SkipWhiteSpace( p, encoding );
01389         if ( !p || !*p || *p != '=' )
01390         {
01391                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01392                 return 0;
01393         }
01394 
01395         ++p;    // skip '='
01396         p = SkipWhiteSpace( p, encoding );
01397         if ( !p || !*p )
01398         {
01399                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01400                 return 0;
01401         }
01402 
01403         const char* end;
01404 
01405         if ( *p == '\'' )
01406         {
01407                 ++p;
01408                 end = "\'";
01409                 p = ReadText( p, &value, false, end, false, encoding );
01410         }
01411         else if ( *p == '"' )
01412         {
01413                 ++p;
01414                 end = "\"";
01415                 p = ReadText( p, &value, false, end, false, encoding );
01416         }
01417         else
01418         {
01419                 // All attribute values should be in single or double quotes.
01420                 // But this is such a common error that the parser will try
01421                 // its best, even without them.
01422                 value = "";
01423                 while (    p && *p                                                                              // existence
01424                                 && !IsWhiteSpace( *p ) && *p != '\n' && *p != '\r'      // whitespace
01425                                 && *p != '/' && *p != '>' )                                             // tag end
01426                 {
01427                         value += *p;
01428                         ++p;
01429                 }
01430         }
01431         return p;
01432 }
01433 
01434 #ifdef TIXML_USE_STL
01435 void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
01436 {
01437         if ( cdata )
01438         {
01439                 int c = in->get();
01440                 if ( c <= 0 )
01441                 {
01442                         TiXmlDocument* document = GetDocument();
01443                         if ( document )
01444                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01445                         return;
01446                 }
01447 
01448                 (*tag) += (char) c;
01449 
01450                 if ( c == '>'
01451                          && tag->at( tag->length() - 2 ) == ']'
01452                          && tag->at( tag->length() - 3 ) == ']' )
01453                 {
01454                         // All is well.
01455                         return;
01456                 }
01457         }
01458         else
01459         {
01460                 while ( in->good() )
01461                 {
01462                         int c = in->peek();
01463                         if ( c == '<' )
01464                                 return;
01465                         if ( c <= 0 )
01466                         {
01467                                 TiXmlDocument* document = GetDocument();
01468                                 if ( document )
01469                                         document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01470                                 return;
01471                         }
01472 
01473                         (*tag) += (char) c;
01474                         in->get();
01475                 }
01476         }
01477 }
01478 #endif
01479 
01480 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01481 {
01482         value = "";
01483         TiXmlDocument* document = GetDocument();
01484 
01485         if ( data )
01486         {
01487                 data->Stamp( p, encoding );
01488                 location = data->Cursor();
01489         }
01490 
01491         const char* const startTag = "<![CDATA[";
01492         const char* const endTag   = "]]>";
01493 
01494         if ( cdata || StringEqual( p, startTag, false, encoding ) )
01495         {
01496                 cdata = true;
01497 
01498                 if ( !StringEqual( p, startTag, false, encoding ) )
01499                 {
01500                         document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
01501                         return 0;
01502                 }
01503                 p += strlen( startTag );
01504 
01505                 // Keep all the white space, ignore the encoding, etc.
01506                 while (    p && *p
01507                                 && !StringEqual( p, endTag, false, encoding )
01508                           )
01509                 {
01510                         value += *p;
01511                         ++p;
01512                 }
01513 
01514                 TIXML_STRING dummy;
01515                 p = ReadText( p, &dummy, false, endTag, false, encoding );
01516                 return p;
01517         }
01518         else
01519         {
01520                 bool ignoreWhite = true;
01521 
01522                 const char* end = "<";
01523                 p = ReadText( p, &value, ignoreWhite, end, false, encoding );
01524                 if ( p )
01525                         return p-1;     // don't truncate the '<'
01526                 return 0;
01527         }
01528 }
01529 
01530 #ifdef TIXML_USE_STL
01531 void TiXmlDeclaration::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
01532 {
01533         while ( in->good() )
01534         {
01535                 int c = in->get();
01536                 if ( c <= 0 )
01537                 {
01538                         TiXmlDocument* document = GetDocument();
01539                         if ( document )
01540                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01541                         return;
01542                 }
01543                 (*tag) += (char) c;
01544 
01545                 if ( c == '>' )
01546                 {
01547                         // All is well.
01548                         return;
01549                 }
01550         }
01551 }
01552 #endif
01553 
01554 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
01555 {
01556         p = SkipWhiteSpace( p, _encoding );
01557         // Find the beginning, find the end, and look for
01558         // the stuff in-between.
01559         TiXmlDocument* document = GetDocument();
01560         if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
01561         {
01562                 if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
01563                 return 0;
01564         }
01565         if ( data )
01566         {
01567                 data->Stamp( p, _encoding );
01568                 location = data->Cursor();
01569         }
01570         p += 5;
01571 
01572         version = "";
01573         encoding = "";
01574         standalone = "";
01575 
01576         while ( p && *p )
01577         {
01578                 if ( *p == '>' )
01579                 {
01580                         ++p;
01581                         return p;
01582                 }
01583 
01584                 p = SkipWhiteSpace( p, _encoding );
01585                 if ( StringEqual( p, "version", true, _encoding ) )
01586                 {
01587                         TiXmlAttribute attrib;
01588                         p = attrib.Parse( p, data, _encoding );
01589                         version = attrib.Value();
01590                 }
01591                 else if ( StringEqual( p, "encoding", true, _encoding ) )
01592                 {
01593                         TiXmlAttribute attrib;
01594                         p = attrib.Parse( p, data, _encoding );
01595                         encoding = attrib.Value();
01596                 }
01597                 else if ( StringEqual( p, "standalone", true, _encoding ) )
01598                 {
01599                         TiXmlAttribute attrib;
01600                         p = attrib.Parse( p, data, _encoding );
01601                         standalone = attrib.Value();
01602                 }
01603                 else
01604                 {
01605                         // Read over whatever it is.
01606                         while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
01607                                 ++p;
01608                 }
01609         }
01610         return 0;
01611 }
01612 
01613 bool TiXmlText::Blank() const
01614 {
01615         for ( unsigned i=0; i<value.length(); i++ )
01616                 if ( !IsWhiteSpace( value[i] ) )
01617                         return false;
01618         return true;
01619 }
01620 
01621 }}