appl: utf8_codecvt_facet.hpp Source File

Go to the documentation of this file.
00001 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
00002 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
00003 // Distributed under the Boost Software License, Version 1.0. (See accompany-
00004 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
00005 
00006 #ifndef BOOST_UTF8_CODECVT_FACET_HPP
00007 #define BOOST_UTF8_CODECVT_FACET_HPP
00008 
00009 // MS compatible compilers support #pragma once
00010 #if defined(_MSC_VER) && (_MSC_VER >= 1020)
00011 # pragma once
00012 #endif
00013 
00015 // utf8_codecvt_facet.hpp
00016 
00017 // This header defines class utf8_codecvt_facet, derived from
00018 // std::codecvt<wchar_t, char>, which can be used to convert utf8 data in
00019 // files into wchar_t strings in the application.
00020 //
00021 // The header is NOT STANDALONE, and is not to be included by the USER.
00022 // There are at least two libraries which want to use this functionality, and
00023 // we want to avoid code duplication. It would be possible to create utf8
00024 // library, but:
00025 // - this requires review process first
00026 // - when linking to a library which uses utf8
00027 //   (say 'program_options'), the user would also have to link to the utf8
00028 //   library. This seems inconvenient, and asking a user to link to an
00029 //   unreviewed library is strange.
00030 // Until the above points are fixed, a library which wants to use utf8 must:
00031 // - include this header from one of its headers or sources
00032 // - include the corresponding .cpp file from one of the sources
00033 // - before including either file, the library must define
00034 //   - BOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used
00035 //   - BOOST_UTF8_END_NAMESPACE to the code to close the previous namespace
00036 //     declaration.
00037 //   - BOOST_UTF8_DECL -- to the code which must be used for all 'exportable'
00038 //     symbols.
00039 //
00040 // For example, program_options library might contain:
00041 //    #define BOOST_UTF8_BEGIN_NAMESPACE <backslash character> 
00042 //             namespace boost { namespace program_options {
00043 //    #define BOOST_UTF8_END_NAMESPACE }}
00044 //    #define BOOST_UTF8_DECL BOOST_PROGRAM_OPTIONS_DECL
00045 //    #include "../../detail/utf8/utf8_codecvt.cpp"
00046 //
00047 // Essentially, each library will have its own copy of utf8 code, in
00048 // different namespaces. 
00049 
00050 // Note:(Robert Ramey).  I have made the following alterations in the original
00051 // code.
00052 // a) Rendered utf8_codecvt<wchar_t, char> without using templates
00053 // b) Move longer functions outside class definition to prevent inlining
00054 // and make code smaller
00055 // c) added on a derived class to permit translation to/from current
00056 // locale to utf8
00057 
00058 //  See http://www.boost.org for updates, documentation, and revision history.
00059 
00060 // archives stored as text - note these are templated on the basic
00061 // stream templates to accommodate wide (and other?) kinds of characters
00062 //
00063 // note the fact that on libraries without wide characters, ostream
00064 // is not a specialization of basic_ostream which in fact is not defined
00065 // in such cases.   So we can't use basic_ostream<OStream::char_type> but rather
00066 // use two template parameters
00067 //
00068 // utf8_codecvt_facet
00069 //   This is an implementation of a std::codecvt facet for translating 
00070 //   from UTF-8 externally to UCS-4.  Note that this is not tied to
00071 //   any specific types in order to allow customization on platforms
00072 //   where wchar_t is not big enough.
00073 //
00074 // NOTES:  The current implementation jumps through some unpleasant hoops in
00075 // order to deal with signed character types.  As a result,
00076 // it is necessary for the ExternType to be convertible to unsigned char.
00077 // I chose not to tie the extern_type explicitly to char. But if any combination
00078 // of types other than <wchar_t, char> is used, then std::codecvt must be
00079 // specialized on those types for this to work.
00080 
00081 #include <locale>
00082 // for mbstate_t
00083 #include <wchar.h>
00084 // for std::size_t
00085 #include <cstddef>
00086 
00087 #include <boost/config.hpp>
00088 #include <boost/detail/workaround.hpp>
00089 
// Import the global-scope mbstate_t (and, when BOOST_NO_STDC_NAMESPACE is
// defined, codecvt) into namespace std for the configurations listed below,
// so that the std::-qualified spellings used in this header resolve.
namespace std {
    #if defined(__LIBCOMO__) \
        || (defined(BOOST_DINKUMWARE_STDLIB) && !defined(__BORLANDC__))
        using ::mbstate_t;
    #elif defined(__SGI_STL_PORT)
        // intentionally empty: nothing is imported for this configuration
    #elif defined(BOOST_NO_STDC_NAMESPACE)
        using ::mbstate_t;
        using ::codecvt;
    #endif
} // namespace std
00101 
// Qualifier placed in front of the mbstate_t& parameter of do_length():
// empty when __MSL_CPP__ or __LIBCOMO__ is defined, `const` otherwise.
#if defined(__MSL_CPP__) || defined(__LIBCOMO__)
    #define BOOST_CODECVT_DO_LENGTH_CONST
#else
    #define BOOST_CODECVT_DO_LENGTH_CONST const
#endif

// maximum length of a multibyte string
#define MB_LENGTH_MAX 8
00110 
00111 BOOST_UTF8_BEGIN_NAMESPACE
00112 
00113 struct BOOST_UTF8_DECL utf8_codecvt_facet :
00114     public std::codecvt<wchar_t, char, std::mbstate_t>  
00115 {
00116 public:
00117     explicit utf8_codecvt_facet(std::size_t no_locale_manage=0)
00118         : std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage) 
00119     {}
00120 protected:
00121     virtual std::codecvt_base::result do_in(
00122         std::mbstate_t& state, 
00123         const char * from,
00124         const char * from_end, 
00125         const char * & from_next,
00126         wchar_t * to, 
00127         wchar_t * to_end, 
00128         wchar_t*& to_next
00129     ) const;
00130 
00131     virtual std::codecvt_base::result do_out(
00132         std::mbstate_t & state, const wchar_t * from,
00133         const wchar_t * from_end, const wchar_t*  & from_next,
00134         char * to, char * to_end, char * & to_next
00135     ) const;
00136 
00137     bool invalid_continuing_octet(unsigned char octet_1) const {
00138         return (octet_1 < 0x80|| 0xbf< octet_1);
00139     }
00140 
00141     bool invalid_leading_octet(unsigned char octet_1)   const {
00142         return (0x7f < octet_1 && octet_1 < 0xc0) ||
00143             (octet_1 > 0xfd);
00144     }
00145 
00146     // continuing octets = octets except for the leading octet
00147     static unsigned int get_cont_octet_count(unsigned   char lead_octet) {
00148         return get_octet_count(lead_octet) - 1;
00149     }
00150 
00151     static unsigned int get_octet_count(unsigned char   lead_octet);
00152 
00153     // How many "continuing octets" will be needed for this word
00154     // ==   total octets - 1.
00155     int get_cont_octet_out_count(wchar_t word) const ;
00156 
00157     virtual bool do_always_noconv() const throw() { return false; }
00158 
00159     // UTF-8 isn't really stateful since we rewind on partial conversions
00160     virtual std::codecvt_base::result do_unshift(
00161         std::mbstate_t&,
00162         char * from,
00163         char * /*to*/,
00164         char * & next
00165     ) const 
00166     {
00167         next = from;
00168         return ok;
00169     }
00170 
00171     virtual int do_encoding() const throw() {
00172         const int variable_byte_external_encoding=0;
00173         return variable_byte_external_encoding;
00174     }
00175 
00176     // How many char objects can I process to get <= max_limit
00177     // wchar_t objects?
00178     virtual int do_length(
00179         BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &,
00180         const char * from,
00181         const char * from_end, 
00182         std::size_t max_limit
00183 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
00184         ) const throw();
00185 #else
00186         ) const;
00187 #endif
00188 
00189     // Largest possible value do_length(state,from,from_end,1) could return.
00190     virtual int do_max_length() const throw () {
00191         return 6; // largest UTF-8 encoding of a UCS-4 character
00192     }
00193 };
00194 
00195 BOOST_UTF8_END_NAMESPACE
00196 
00197 #endif // BOOST_UTF8_CODECVT_FACET_HPP