cprover
unicode.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module:
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
9 #include "unicode.h"
10 
11 #include <cstring>
12 #include <locale>
13 #include <iomanip>
14 #include <sstream>
15 #include <cstdint>
16 
17 #ifdef _WIN32
18 #include <windows.h>
19 #endif
20 
24 {
25  uint32_t i=1;
26  return reinterpret_cast<uint8_t &>(i);
27 }
28 
29 #define BUFSIZE 100
30 
/// Convert a null-terminated wide string to a narrow string.
/// On Windows the text is converted to UTF-8 with WideCharToMultiByte.
/// On every other platform this is only a placeholder conversion: each
/// wide character is cast to `char`, so values that do not fit into a
/// `char` are truncated rather than encoded.
/// \param s: null-terminated wide string
/// \return the narrow string
std::string narrow(const wchar_t *s)
{
#ifdef _WIN32

  const int wide_length=static_cast<int>(wcslen(s));

  // first call only computes the number of bytes required
  const int narrow_length=
    WideCharToMultiByte(
      CP_UTF8, 0, s, wide_length, nullptr, 0, nullptr, nullptr);

  std::string result(narrow_length, '\0');

  // second call performs the actual conversion into the buffer
  WideCharToMultiByte(
    CP_UTF8, 0, s, wide_length, &result[0], narrow_length, nullptr, nullptr);

  return result;

#else

  // placeholder conversion: one char per wide character
  const std::size_t wide_length=wcslen(s);

  std::string result;
  result.reserve(wide_length);

  for(std::size_t pos=0; pos<wide_length; ++pos)
    result.push_back(static_cast<char>(s[pos]));

  return result;

#endif
}
55 
/// Convert a null-terminated narrow string to a wide string.
/// On Windows the input is interpreted as UTF-8 and converted with
/// MultiByteToWideChar. On every other platform this is only a
/// placeholder conversion: each `char` is converted to one `wchar_t`,
/// no multi-byte sequences are decoded.
/// \param s: null-terminated narrow string
/// \return the wide string
std::wstring widen(const char *s)
{
#ifdef _WIN32

  const int narrow_length=static_cast<int>(strlen(s));

  // first call only computes the number of wide characters required
  const int wide_length=
    MultiByteToWideChar(CP_UTF8, 0, s, narrow_length, nullptr, 0);

  std::wstring result(wide_length, L'\0');

  // second call performs the actual conversion into the buffer
  MultiByteToWideChar(
    CP_UTF8, 0, s, narrow_length, &result[0], wide_length);

  return result;

#else

  // placeholder conversion: one wide character per char
  std::wstring result;
  result.reserve(strlen(s));

  for(const char *cursor=s; *cursor!=0; ++cursor)
    result.push_back(static_cast<wchar_t>(*cursor));

  return result;

#endif
}
80 
/// Convert a wide string to a narrow string.
/// On Windows the text is converted to UTF-8 with WideCharToMultiByte.
/// On every other platform this is only a placeholder conversion: each
/// wide character is converted to a single `char`.
/// \param s: wide string
/// \return the narrow string
std::string narrow(const std::wstring &s)
{
#ifdef _WIN32

  const int wide_length=static_cast<int>(s.size());
  const wchar_t *const wide_chars=&s[0];

  // first call only computes the number of bytes required
  const int narrow_length=
    WideCharToMultiByte(
      CP_UTF8, 0, wide_chars, wide_length, nullptr, 0, nullptr, nullptr);

  std::string result(narrow_length, '\0');

  // second call performs the actual conversion into the buffer
  WideCharToMultiByte(
    CP_UTF8,
    0,
    wide_chars,
    wide_length,
    &result[0],
    narrow_length,
    nullptr,
    nullptr);

  return result;

#else

  // placeholder conversion: one char per wide character
  std::string result;
  result.reserve(s.size());

  for(const wchar_t wide_char : s)
    result.push_back(static_cast<char>(wide_char));

  return result;

#endif
}
97 
/// Convert a narrow string to a wide string.
/// On Windows the input is interpreted as UTF-8 and converted with
/// MultiByteToWideChar. On every other platform this is only a
/// placeholder conversion: each `char` is converted to one `wchar_t`.
/// \param s: narrow string
/// \return the wide string
std::wstring widen(const std::string &s)
{
#ifdef _WIN32

  const int narrow_length=static_cast<int>(s.size());
  const char *const narrow_chars=&s[0];

  // first call only computes the number of wide characters required
  const int wide_length=
    MultiByteToWideChar(CP_UTF8, 0, narrow_chars, narrow_length, nullptr, 0);

  std::wstring result(wide_length, L'\0');

  // second call performs the actual conversion into the buffer
  MultiByteToWideChar(
    CP_UTF8, 0, narrow_chars, narrow_length, &result[0], wide_length);

  return result;

#else

  // placeholder conversion: one wide character per char
  std::wstring result;
  result.reserve(s.size());

  for(const char narrow_char : s)
    result.push_back(static_cast<wchar_t>(narrow_char));

  return result;

#endif
}
114 
/// Appends a unicode character to a utf8-encoded string.
/// The encoding length is chosen from the magnitude of the code point:
/// one byte up to 0x7f, two up to 0x7ff, three up to 0xffff and four
/// bytes for everything above. The value is not checked for being a
/// valid code point.
/// \param c: code point to encode
/// \param result: string the encoded bytes are appended to
static void utf8_append_code(unsigned int c, std::string &result)
{
  // plain ASCII is stored as is
  if(c<=0x7f)
  {
    result.push_back(static_cast<char>(c));
    return;
  }

  // number of continuation bytes and the marker bits of the lead byte
  unsigned int continuation_count;
  unsigned int lead_marker;

  if(c<=0x7ff)
  {
    continuation_count=1;
    lead_marker=0xc0;
  }
  else if(c<=0xffff)
  {
    continuation_count=2;
    lead_marker=0xe0;
  }
  else
  {
    continuation_count=3;
    lead_marker=0xf0;
  }

  // the lead byte carries the most significant payload bits
  result.push_back(
    static_cast<char>((c >> (6*continuation_count)) | lead_marker));

  // each continuation byte carries six payload bits, highest group first
  for(unsigned int remaining=continuation_count; remaining>0; --remaining)
  {
    const unsigned int shift=6*(remaining-1);
    result.push_back(static_cast<char>(((c >> shift) & 0x3f) | 0x80));
  }
}
140 
143 std::string utf32_to_utf8(const std::basic_string<unsigned int> &s)
144 {
145  std::string result;
146 
147  result.reserve(s.size()); // at least that long
148 
149  for(const auto c : s)
150  utf8_append_code(c, result);
151 
152  return result;
153 }
154 
155 const char **narrow_argv(int argc, const wchar_t **argv_wide)
156 {
157  if(argv_wide==nullptr)
158  return nullptr;
159 
160  // the following never gets deleted
161  const char **argv_narrow=new const char *[argc+1];
162  argv_narrow[argc]=nullptr;
163 
164  for(int i=0; i<argc; i++)
165  argv_narrow[i]=strdup(narrow(argv_wide[i]).c_str());
166 
167  return argv_narrow;
168 }
169 
/// A helper function for dealing with different UTF16 endians.
/// \param x: a 16-bit value
/// \return \p x with its two bytes exchanged
uint16_t do_swap_bytes(uint16_t x)
{
  const unsigned int low_byte=x & 0xFFu;
  const unsigned int high_byte=(x >> 8) & 0xFFu;

  // the former low byte becomes the high byte and vice versa
  return static_cast<uint16_t>((low_byte << 8) | high_byte);
}
179 
180 
181 void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result)
182 {
183  // we do not treat 0xD800 to 0xDFFF, although
184  // they are not valid unicode symbols
185 
186  if(code<0xFFFF)
187  { // code is encoded as one UTF16 character
188  // we just take the code and possibly swap the bytes
189  unsigned int a=(swap_bytes)?do_swap_bytes(code):code;
190  result+=static_cast<wchar_t>(a);
191  }
192  else // code is encoded as two UTF16 characters
193  {
194  // if this is valid unicode, we have
195  // code<0x10FFFF
196  // but let's not check it programmatically
197 
198  // encode the code in UTF16, possibly swapping bytes.
199  code=code-0x10000;
200  unsigned int i1=((code>>10) & 0x3ff) | 0xD800;
201  unsigned int a1=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i1)):i1;
202  result+=static_cast<wchar_t>(a1);
203  unsigned int i2=(code & 0x3ff) | 0xDC00;
204  unsigned int a2=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i2)):i2;
205  result+=static_cast<wchar_t>(a2);
206  }
207 }
208 
209 
214 std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
215 {
216  std::wstring result;
217  result.reserve(in.size());
219  while(i<in.size())
220  {
221  unsigned char c=in[i++];
222  unsigned int code=0;
223  // the ifs that follow find out how many UTF8 characters (1-4) store the
224  // next unicode character. This is determined by the few most
225  // significant bits.
226  if(c<=0x7F)
227  {
228  // if it's one character, then code is exactly the value
229  code=c;
230  }
231  else if(c<=0xDF && i<in.size())
232  { // in other cases, we need to read the right number of chars and decode
233  // note: if we wanted to make sure that we capture incorrect strings,
234  // we should check that whatever follows first character starts with
235  // bits 10.
236  code=(c & 0x1F) << 6;
237  c=in[i++];
238  code+=c & 0x3F;
239  }
240  else if(c<=0xEF && i+1<in.size())
241  {
242  code=(c & 0xF) << 12;
243  c=in[i++];
244  code+=(c & 0x3F) << 6;
245  c=in[i++];
246  code+=c & 0x3F;
247  }
248  else if(c<=0xF7 && i+2<in.size())
249  {
250  code=(c & 0x7) << 18;
251  c=in[i++];
252  code+=(c & 0x3F) << 12;
253  c=in[i++];
254  code+=(c & 0x3F) << 6;
255  c=in[i++];
256  code+=c & 0x3F;
257  }
258  else
259  {
260  // The string is not a valid UTF8 string! Either it has some characters
261  // missing from a multi-character unicode symbol, or it has a char with
262  // too high value.
263  // For now, let's replace the character with a space
264  code=32;
265  }
266 
267  utf16_append_code(code, swap_bytes, result);
268  }
269 
270  return result;
271 }
272 
275 std::wstring utf8_to_utf16_big_endian(const std::string& in)
276 {
277  bool swap_bytes=is_little_endian_arch();
278  return utf8_to_utf16(in, swap_bytes);
279 }
280 
283 std::wstring utf8_to_utf16_little_endian(const std::string& in)
284 {
285  bool swap_bytes=!is_little_endian_arch();
286  return utf8_to_utf16(in, swap_bytes);
287 }
288 
/// Render a wide string with escapes for everything that is not a
/// printable character with a value of at most 255.
/// Such printable characters are emitted as a single byte; every other
/// element is written as `\u` followed by its value in lower-case hex,
/// zero-padded to at least four digits. Printability is decided with a
/// default-constructed std::locale.
/// \param in: wide string to render
/// \return the escaped narrow string
std::string utf16_little_endian_to_ascii(const std::wstring& in)
{
  const std::locale current_locale;
  std::ostringstream out;

  for(std::wstring::const_iterator it=in.begin(); it!=in.end(); ++it)
  {
    const wchar_t ch=*it;

    const bool emit_verbatim=ch<=255 && isprint(ch, current_locale);

    if(!emit_verbatim)
    {
      // escape sequence: backslash, 'u', then the padded hex value
      out << "\\u"
          << std::hex
          << std::setw(4)
          << std::setfill('0')
          << static_cast<unsigned int>(ch);
      continue;
    }

    out << static_cast<unsigned char>(ch);
  }

  return out.str();
}
#define loc()
std::string narrow(const wchar_t *s)
Definition: unicode.cpp:31
static int8_t r
Definition: irep_hash.h:59
std::wstring widen(const char *s)
Definition: unicode.cpp:56
const char ** narrow_argv(int argc, const wchar_t **argv_wide)
Definition: unicode.cpp:155
uint16_t do_swap_bytes(uint16_t x)
A helper function for dealing with different UTF16 endians.
Definition: unicode.cpp:173
std::string utf32_to_utf8(const std::basic_string< unsigned int > &s)
Definition: unicode.cpp:143
std::wstring utf8_to_utf16(const std::string &in, bool swap_bytes)
Definition: unicode.cpp:214
unsignedbv_typet size_type()
Definition: c_types.cpp:57
bool is_little_endian_arch()
Determine endianness of the architecture.
Definition: unicode.cpp:23
std::string utf16_little_endian_to_ascii(const std::wstring &in)
Definition: unicode.cpp:291
static void utf8_append_code(unsigned int c, std::string &result)
Appends a unicode character to a utf8-encoded string.
Definition: unicode.cpp:117
std::wstring utf8_to_utf16_little_endian(const std::string &in)
Definition: unicode.cpp:283
void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result)
Definition: unicode.cpp:181
std::wstring utf8_to_utf16_big_endian(const std::string &in)
Definition: unicode.cpp:275