00001 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) 00002 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). 00003 // Distributed under the Boost Software License, Version 1.0. (See accompany- 00004 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 00005 00006 #ifndef BOOST_UTF8_CODECVT_FACET_HPP 00007 #define BOOST_UTF8_CODECVT_FACET_HPP 00008 00009 // MS compatible compilers support #pragma once 00010 #if defined(_MSC_VER) && (_MSC_VER >= 1020) 00011 # pragma once 00012 #endif 00013 00015 // utf8_codecvt_facet.hpp 00016 00017 // This header defines class utf8_codecvt_facet, derived fro 00018 // std::codecvt<wchar_t, char>, which can be used to convert utf8 data in 00019 // files into wchar_t strings in the application. 00020 // 00021 // The header is NOT STANDALONE, and is not to be included by the USER. 00022 // There are at least two libraries which want to use this functionality, and 00023 // we want to avoid code duplication. It would be possible to create utf8 00024 // library, but: 00025 // - this requires review process first 00026 // - in the case, when linking the a library which uses utf8 00027 // (say 'program_options'), user should also link to the utf8 library. 00028 // This seems inconvenient, and asking a user to link to an unrevieved 00029 // library is strange. 00030 // Until the above points are fixed, a library which wants to use utf8 must: 00031 // - include this header from one of it's headers or sources 00032 // - include the corresponding .cpp file from one of the sources 00033 // - before including either file, the library must define 00034 // - BOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used 00035 // - BOOST_UTF8_END_NAMESPACE to the code to close the previous namespace 00036 // - declaration. 00037 // - BOOST_UTF8_DECL -- to the code which must be used for all 'exportable' 00038 // symbols. 00039 // 00040 // For example, program_options library might contain: 00041 // #define BOOST_UTF8_BEGIN_NAMESPACE <backslash character> 00042 // namespace boost { namespace program_options { 00043 // #define BOOST_UTF8_END_NAMESPACE }} 00044 // #define BOOST_UTF8_DECL BOOST_PROGRAM_OPTIONS_DECL 00045 // #include "../../detail/utf8/utf8_codecvt.cpp" 00046 // 00047 // Essentially, each library will have its own copy of utf8 code, in 00048 // different namespaces. 00049 00050 // Note:(Robert Ramey). I have made the following alterations in the original 00051 // code. 00052 // a) Rendered utf8_codecvt<wchar_t, char> with using templates 00053 // b) Move longer functions outside class definition to prevent inlining 00054 // and make code smaller 00055 // c) added on a derived class to permit translation to/from current 00056 // locale to utf8 00057 00058 // See http://www.boost.org for updates, documentation, and revision history. 00059 00060 // archives stored as text - note these ar templated on the basic 00061 // stream templates to accommodate wide (and other?) kind of characters 00062 // 00063 // note the fact that on libraries without wide characters, ostream is 00064 // is not a specialization of basic_ostream which in fact is not defined 00065 // in such cases. So we can't use basic_ostream<OStream::char_type> but rather 00066 // use two template parameters 00067 // 00068 // utf8_codecvt_facet 00069 // This is an implementation of a std::codecvt facet for translating 00070 // from UTF-8 externally to UCS-4. Note that this is not tied to 00071 // any specific types in order to allow customization on platforms 00072 // where wchar_t is not big enough. 00073 // 00074 // NOTES: The current implementation jumps through some unpleasant hoops in 00075 // order to deal with signed character types. As a std::codecvt_base::result, 00076 // it is necessary for the ExternType to be convertible to unsigned char. 00077 // I chose not to tie the extern_type explicitly to char. But if any combination 00078 // of types other than <wchar_t,char_t> is used, then std::codecvt must be 00079 // specialized on those types for this to work. 00080 00081 #include <locale> 00082 // for mbstate_t 00083 #include <wchar.h> 00084 // for std::size_t 00085 #include <cstddef> 00086 00087 #include <boost/config.hpp> 00088 #include <boost/detail/workaround.hpp> 00089 00090 namespace std { 00091 #if defined(__LIBCOMO__) 00092 using ::mbstate_t; 00093 #elif defined(BOOST_DINKUMWARE_STDLIB) && !defined(__BORLANDC__) 00094 using ::mbstate_t; 00095 #elif defined(__SGI_STL_PORT) 00096 #elif defined(BOOST_NO_STDC_NAMESPACE) 00097 using ::mbstate_t; 00098 using ::codecvt; 00099 #endif 00100 } // namespace std 00101 00102 #if !defined(__MSL_CPP__) && !defined(__LIBCOMO__) 00103 #define BOOST_CODECVT_DO_LENGTH_CONST const 00104 #else 00105 #define BOOST_CODECVT_DO_LENGTH_CONST 00106 #endif 00107 00108 // maximum lenght of a multibyte string 00109 #define MB_LENGTH_MAX 8 00110 00111 BOOST_UTF8_BEGIN_NAMESPACE 00112 00113 struct BOOST_UTF8_DECL utf8_codecvt_facet : 00114 public std::codecvt<wchar_t, char, std::mbstate_t> 00115 { 00116 public: 00117 explicit utf8_codecvt_facet(std::size_t no_locale_manage=0) 00118 : std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage) 00119 {} 00120 protected: 00121 virtual std::codecvt_base::result do_in( 00122 std::mbstate_t& state, 00123 const char * from, 00124 const char * from_end, 00125 const char * & from_next, 00126 wchar_t * to, 00127 wchar_t * to_end, 00128 wchar_t*& to_next 00129 ) const; 00130 00131 virtual std::codecvt_base::result do_out( 00132 std::mbstate_t & state, const wchar_t * from, 00133 const wchar_t * from_end, const wchar_t* & from_next, 00134 char * to, char * to_end, char * & to_next 00135 ) const; 00136 00137 bool invalid_continuing_octet(unsigned char octet_1) const { 00138 return (octet_1 < 0x80|| 0xbf< octet_1); 00139 } 00140 00141 bool invalid_leading_octet(unsigned char octet_1) const { 00142 return (0x7f < octet_1 && octet_1 < 0xc0) || 00143 (octet_1 > 0xfd); 00144 } 00145 00146 // continuing octets = octets except for the leading octet 00147 static unsigned int get_cont_octet_count(unsigned char lead_octet) { 00148 return get_octet_count(lead_octet) - 1; 00149 } 00150 00151 static unsigned int get_octet_count(unsigned char lead_octet); 00152 00153 // How many "continuing octets" will be needed for this word 00154 // == total octets - 1. 00155 int get_cont_octet_out_count(wchar_t word) const ; 00156 00157 virtual bool do_always_noconv() const throw() { return false; } 00158 00159 // UTF-8 isn't really stateful since we rewind on partial conversions 00160 virtual std::codecvt_base::result do_unshift( 00161 std::mbstate_t&, 00162 char * from, 00163 char * /*to*/, 00164 char * & next 00165 ) const 00166 { 00167 next = from; 00168 return ok; 00169 } 00170 00171 virtual int do_encoding() const throw() { 00172 const int variable_byte_external_encoding=0; 00173 return variable_byte_external_encoding; 00174 } 00175 00176 // How many char objects can I process to get <= max_limit 00177 // wchar_t objects? 00178 virtual int do_length( 00179 BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &, 00180 const char * from, 00181 const char * from_end, 00182 std::size_t max_limit 00183 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) 00184 ) const throw(); 00185 #else 00186 ) const; 00187 #endif 00188 00189 // Largest possible value do_length(state,from,from_end,1) could return. 00190 virtual int do_max_length() const throw () { 00191 return 6; // largest UTF-8 encoding of a UCS-4 character 00192 } 00193 }; 00194 00195 BOOST_UTF8_END_NAMESPACE 00196 00197 #endif // BOOST_UTF8_CODECVT_FACET_HPP