// Copyright © 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). Permission to copy,
// use, modify, sell and distribute this software is granted provided this
// copyright notice appears in all copies. This software is provided "as is"
// without express or implied warranty, and with no claim as to its suitability
// for any purpose.

#ifndef BOOST_UTF8_CODECVT_FACET_HPP
#define BOOST_UTF8_CODECVT_FACET_HPP

// MS compatible compilers support #pragma once
#if defined(_MSC_VER) && (_MSC_VER >= 1020)
# pragma once
#endif

/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// utf8_codecvt_facet.hpp

// This header defines class utf8_codecvt_facet, derived from
// std::codecvt<wchar_t, char>, which can be used to convert utf8 data in
// files into wchar_t strings in the application.
//
// The header is NOT STANDALONE, and is not to be included by the USER.
// There are at least two libraries which want to use this functionality, and
// we want to avoid code duplication. It would be possible to create a utf8
// library, but:
// - this requires a review process first
// - in that case, when linking a library which uses utf8
//   (say 'program_options'), the user should also link to the utf8 library.
//   This seems inconvenient, and asking a user to link to an unreviewed
//   library is strange.
// Until the above points are fixed, a library which wants to use utf8 must:
// - include this header from one of its headers or sources
// - include the corresponding .cpp file from one of the sources
// - before including either file, the library must define
//   - BOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used
//   - BOOST_UTF8_END_NAMESPACE to the code to close the previous namespace
//     declaration.
//   - BOOST_UTF8_DECL -- to the code which must be used for all 'exportable'
//     symbols.
//
// For example, program_options library might contain:
//    #define BOOST_UTF8_BEGIN_NAMESPACE <backslash character>
//             namespace boost { namespace program_options {
//    #define BOOST_UTF8_END_NAMESPACE }}
//    #define BOOST_UTF8_DECL BOOST_PROGRAM_OPTIONS_DECL
//    #include "../../detail/utf8/utf8_codecvt.cpp"
//
// Essentially, each library will have its own copy of utf8 code, in
// different namespaces.

// Note:(Robert Ramey). I have made the following alterations in the original
// code.
// a) Rendered utf8_codecvt<wchar_t, char> with using templates
// b) Move longer functions outside class definition to prevent inlining
//    and make code smaller
// c) added on a derived class to permit translation to/from current
//    locale to utf8

// See http://www.boost.org for updates, documentation, and revision history.

// archives stored as text - note these are templated on the basic
// stream templates to accommodate wide (and other?) kind of characters
//
// note the fact that on libraries without wide characters, ostream
// is not a specialization of basic_ostream which in fact is not defined
// in such cases.
So we can't use basic_ostream<OStream::char_type> but rather 00068 // use two template parameters 00069 // 00070 // utf8_codecvt_facet 00071 // This is an implementation of a std::codecvt facet for translating 00072 // from UTF-8 externally to UCS-4. Note that this is not tied to 00073 // any specific types in order to allow customization on platforms 00074 // where wchar_t is not big enough. 00075 // 00076 // NOTES: The current implementation jumps through some unpleasant hoops in 00077 // order to deal with signed character types. As a std::codecvt_base::result, 00078 // it is necessary for the ExternType to be convertible to unsigned char. 00079 // I chose not to tie the extern_type explicitly to char. But if any combination 00080 // of types other than <wchar_t,char_t> is used, then std::codecvt must be 00081 // specialized on those types for this to work. 00082 00083 #include <locale> 00084 // for mbstate_t 00085 #include <wchar.h> 00086 // for std::size_t 00087 #include <cstddef> 00088 00089 #include <sysc/packages/boost/config.hpp> 00090 #include <sysc/packages/boost/detail/workaround.hpp> 00091 00092 namespace std { 00093 #if defined(__LIBCOMO__) 00094 using ::mbstate_t; 00095 #elif defined(BOOST_DINKUMWARE_STDLIB) 00096 using ::mbstate_t; 00097 #elif defined(__SGI_STL_PORT) 00098 #elif defined(BOOST_NO_STDC_NAMESPACE) 00099 using ::mbstate_t; 00100 using ::codecvt; 00101 #endif 00102 } // namespace std 00103 00104 #if !defined(__MSL_CPP__) && !defined(__LIBCOMO__) 00105 #define BOOST_CODECVT_DO_LENGTH_CONST const 00106 #else 00107 #define BOOST_CODECVT_DO_LENGTH_CONST 00108 #endif 00109 00110 // maximum lenght of a multibyte string 00111 #define MB_LENGTH_MAX 8 00112 00113 BOOST_UTF8_BEGIN_NAMESPACE 00114 00115 struct BOOST_UTF8_DECL utf8_codecvt_facet : 00116 public std::codecvt<wchar_t, char, std::mbstate_t> 00117 { 00118 public: 00119 explicit utf8_codecvt_facet(std::size_t no_locale_manage=0) 00120 : std::codecvt<wchar_t, char, 
std::mbstate_t>(no_locale_manage) 00121 {} 00122 protected: 00123 virtual std::codecvt_base::result do_in( 00124 std::mbstate_t& state, 00125 const char * from, 00126 const char * from_end, 00127 const char * & from_next, 00128 wchar_t * to, 00129 wchar_t * to_end, 00130 wchar_t*& to_next 00131 ) const; 00132 00133 virtual std::codecvt_base::result do_out( 00134 std::mbstate_t & state, const wchar_t * from, 00135 const wchar_t * from_end, const wchar_t* & from_next, 00136 char * to, char * to_end, char * & to_next 00137 ) const; 00138 00139 bool invalid_continuing_octet(unsigned char octet_1) const { 00140 return (octet_1 < 0x80|| 0xbf< octet_1); 00141 } 00142 00143 bool invalid_leading_octet(unsigned char octet_1) const { 00144 return (0x7f < octet_1 && octet_1 < 0xc0) || 00145 (octet_1 > 0xfd); 00146 } 00147 00148 // continuing octets = octets except for the leading octet 00149 static unsigned int get_cont_octet_count(unsigned char lead_octet) { 00150 return get_octet_count(lead_octet) - 1; 00151 } 00152 00153 static unsigned int get_octet_count(unsigned char lead_octet); 00154 00155 // How many "continuing octets" will be needed for this word 00156 // == total octets - 1. 00157 int get_cont_octet_out_count(wchar_t word) const ; 00158 00159 virtual bool do_always_noconv() const throw() { return false; } 00160 00161 // UTF-8 isn't really stateful since we rewind on partial conversions 00162 virtual std::codecvt_base::result do_unshift( 00163 std::mbstate_t&, 00164 char * from, 00165 char * to, 00166 char * & next 00167 ) const 00168 { 00169 next = from; 00170 return ok; 00171 } 00172 00173 virtual int do_encoding() const throw() { 00174 const int variable_byte_external_encoding=0; 00175 return variable_byte_external_encoding; 00176 } 00177 00178 // How many char objects can I process to get <= max_limit 00179 // wchar_t objects? 
00180 virtual int do_length( 00181 BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &, 00182 const char * from, 00183 const char * from_end, 00184 std::size_t max_limit 00185 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) 00186 ) const throw(); 00187 #else 00188 ) const; 00189 #endif 00190 00191 // Largest possible value do_length(state,from,from_end,1) could return. 00192 virtual int do_max_length() const throw () { 00193 return 6; // largest UTF-8 encoding of a UCS-4 character 00194 } 00195 }; 00196 00197 BOOST_UTF8_END_NAMESPACE 00198 00199 #endif // BOOST_UTF8_CODECVT_FACET_HPP
1.5.5