cprover
string_constraint_generator_format.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module: Generates string constraints for the Java format function
4 
5 Author: Romain Brenguier
6 
7 Date: May 2017
8 
9 \*******************************************************************/
10 
13 
14 #include <iomanip>
15 #include <string>
16 #include <regex>
17 #include <vector>
18 
19 #include <util/std_expr.h>
20 #include <util/unicode.h>
21 
24 
25 // Format specifier describes how a value should be printed.
27 {
28 public:
29  // Constants describing the meaning of characters in format specifiers.
30  static const char DECIMAL_INTEGER ='d';
31  static const char OCTAL_INTEGER ='o';
32  static const char HEXADECIMAL_INTEGER ='x';
33  static const char HEXADECIMAL_INTEGER_UPPER='X';
34  static const char SCIENTIFIC ='e';
35  static const char SCIENTIFIC_UPPER ='E';
36  static const char GENERAL ='g';
37  static const char GENERAL_UPPER ='G';
38  static const char DECIMAL_FLOAT ='f';
39  static const char HEXADECIMAL_FLOAT ='a';
40  static const char HEXADECIMAL_FLOAT_UPPER ='A';
41  static const char CHARACTER ='c';
42  static const char CHARACTER_UPPER ='C';
43  static const char DATE_TIME ='t';
44  static const char DATE_TIME_UPPER ='T';
45  static const char BOOLEAN ='b';
46  static const char BOOLEAN_UPPER ='B';
47  static const char STRING ='s';
48  static const char STRING_UPPER ='S';
49  static const char HASHCODE ='h';
50  static const char HASHCODE_UPPER ='H';
51  static const char LINE_SEPARATOR ='n';
52  static const char PERCENT_SIGN ='%';
53 
54  int index=-1;
55  std::string flag;
56  int width;
57  int precision;
58  bool dt=false;
59  char conversion;
60 
62  int _index,
63  std::string _flag,
64  int _width,
65  int _precision,
66  bool _dt,
67  char c):
68  index(_index),
69  flag(_flag),
70  width(_width),
71  precision(_precision),
72  dt(_dt),
73  conversion(c)
74  { }
75 };
76 
77 // Format text represent a constant part of a format string.
79 {
80 public:
81  explicit format_textt(std::string _content): content(_content) { }
82 
84 
85  std::string get_content() const
86  {
87  return content;
88  }
89 
90 private:
91  std::string content;
92 };
93 
94 // A format element is either a specifier or text.
96 {
97 public:
98  typedef enum {SPECIFIER, TEXT} format_typet;
99 
100  explicit format_elementt(format_typet _type): type(_type), fstring("")
101  {
102  }
103 
104  explicit format_elementt(std::string s): type(TEXT), fstring(s)
105  {
106  }
107 
109  {
110  fspec.push_back(fs);
111  }
112 
113  bool is_format_specifier() const
114  {
115  return type==SPECIFIER;
116  }
117 
118  bool is_format_text() const
119  {
120  return type==TEXT;
121  }
122 
124  {
126  return fspec.back();
127  }
128 
130  {
132  return fstring;
133  }
134 
136  {
138  return fstring;
139  }
140 
141 private:
144  std::vector<format_specifiert> fspec;
145 };
146 
147 #if 0
148 // This code is deactivated as it is not used for now, but ultimalety this
149 // should be used to throw an exception when the format string is not correct
154 static bool check_format_string(std::string s)
155 {
156  std::string format_specifier=
157  "%(\\d+\\$)?([-#+ 0,(\\<]*)?(\\d+)?(\\.\\d+)?([tT])?([a-zA-Z%])";
158  std::regex regex(format_specifier);
159  std::smatch match;
160 
161  while(std::regex_search(s, match, regex))
162  {
163  if(match.position()!= 0)
164  for(const auto &c : match.str())
165  if(c=='%')
166  return false;
167  s=match.suffix();
168  }
169 
170  for(const auto &c : s)
171  if(c=='%')
172  return false;
173 
174  return true;
175 }
176 #endif
177 
186 {
187  int index=m[1].str().empty()?-1:std::stoi(m[1].str());
188  std::string flag=m[2].str().empty()?"":m[2].str();
189  int width=m[3].str().empty()?-1:std::stoi(m[3].str());
190  int precision=m[4].str().empty()?-1:std::stoi(m[4].str());
191  std::string tT=m[5].str();
192 
193  bool dt=(tT!="");
194  if(tT=="T")
195  flag.push_back(format_specifiert::DATE_TIME_UPPER);
196 
197  INVARIANT(
198  m[6].str().length()==1, "format conversion should be one character");
199  char conversion=m[6].str()[0];
200 
201  return format_specifiert(index, flag, width, precision, dt, conversion);
202 }
203 
209 static std::vector<format_elementt> parse_format_string(std::string s)
210 {
211  std::string format_specifier=
212  "%(\\d+\\$)?([-#+ 0,(\\<]*)?(\\d+)?(\\.\\d+)?([tT])?([a-zA-Z%])";
213  std::regex regex(format_specifier);
214  std::vector<format_elementt> al;
215  std::smatch match;
216 
217  while(std::regex_search(s, match, regex))
218  {
219  if(match.position()!=0)
220  {
221  std::string pre_match=s.substr(0, match.position());
222  al.emplace_back(pre_match);
223  }
224 
225  al.emplace_back(format_specifier_of_match(match));
226  s=match.suffix();
227  }
228 
229  al.emplace_back(s);
230  return al;
231 }
232 
238  const struct_exprt &expr, irep_idt component_name)
239 {
240  const struct_typet &type=to_struct_type(expr.type());
241  std::size_t number=type.component_number(component_name);
242  return expr.operands()[number];
243 }
244 
259 static std::pair<array_string_exprt, string_constraintst>
261  symbol_generatort &fresh_symbol,
262  const format_specifiert &fs,
263  const struct_exprt &arg,
264  const typet &index_type,
265  const typet &char_type,
266  array_poolt &array_pool,
267  const messaget &message,
268  const namespacet &ns)
269 {
270  string_constraintst constraints;
271  const array_string_exprt res = array_pool.fresh_string(index_type, char_type);
272  std::pair<exprt, string_constraintst> return_code;
273  switch(fs.conversion)
274  {
276  return_code = add_axioms_for_string_of_int(
277  res, get_component_in_struct(arg, ID_int), 0, ns);
278  return {res, std::move(return_code.second)};
280  return_code =
282  return {res, std::move(return_code.second)};
285  fresh_symbol,
286  res,
287  get_component_in_struct(arg, ID_float),
288  array_pool,
289  ns);
290  return {res, std::move(return_code.second)};
292  return_code = add_axioms_for_string_of_float(
293  fresh_symbol,
294  res,
295  get_component_in_struct(arg, ID_float),
296  array_pool,
297  ns);
298  return {res, std::move(return_code.second)};
300  return_code =
301  add_axioms_from_char(res, get_component_in_struct(arg, ID_char));
302  return {res, std::move(return_code.second)};
304  return_code =
305  add_axioms_from_bool(res, get_component_in_struct(arg, ID_boolean));
306  return {res, std::move(return_code.second)};
308  return {
309  get_string_expr(array_pool, get_component_in_struct(arg, "string_expr")),
310  {}};
312  return_code = add_axioms_for_string_of_int(
313  res, get_component_in_struct(arg, "hashcode"), 0, ns);
314  return {res, std::move(return_code.second)};
316  // TODO: the constant should depend on the system: System.lineSeparator()
317  return_code = add_axioms_for_constant(res, "\n");
318  return {res, std::move(return_code.second)};
320  return_code = add_axioms_for_constant(res, "%");
321  return {res, std::move(return_code.second)};
330  {
331  format_specifiert fs_lower = fs;
332  fs_lower.conversion=tolower(fs.conversion);
333  auto format_specifier_result = add_axioms_for_format_specifier(
334  fresh_symbol,
335  fs_lower,
336  arg,
337  index_type,
338  char_type,
339  array_pool,
340  message,
341  ns);
342 
343  const exprt return_code_upper_case =
344  fresh_symbol("return_code_upper_case", get_return_code_type());
345  const string_to_upper_case_builtin_functiont upper_case_function(
346  return_code_upper_case, res, format_specifier_result.first);
347  auto upper_case_constraints = upper_case_function.constraints(fresh_symbol);
348  merge(upper_case_constraints, std::move(format_specifier_result.second));
349  return {res, std::move(upper_case_constraints)};
350  }
359  // For all these unimplemented cases we return a non-deterministic string
360  message.warning() << "unimplemented format specifier: " << fs.conversion
361  << message.eom;
362  return {array_pool.fresh_string(index_type, char_type), {}};
363  default:
364  message.error() << "invalid format specifier: " << fs.conversion
365  << message.eom;
366  INVARIANT(
367  false, "format specifier must belong to [bBhHsScCdoxXeEfgGaAtT%n]");
368  }
369 }
370 
381 std::pair<exprt, string_constraintst> add_axioms_for_format(
382  symbol_generatort &fresh_symbol,
383  const array_string_exprt &res,
384  const std::string &s,
385  const exprt::operandst &args,
386  array_poolt &array_pool,
387  const messaget &message,
388  const namespacet &ns)
389 {
390  string_constraintst constraints;
391  const std::vector<format_elementt> format_strings=parse_format_string(s);
392  std::vector<array_string_exprt> intermediary_strings;
393  std::size_t arg_count=0;
394  const typet &char_type = res.content().type().subtype();
395  const typet &index_type = res.length().type();
396 
397  for(const format_elementt &fe : format_strings)
398  {
399  if(fe.is_format_specifier())
400  {
401  const format_specifiert &fs=fe.get_format_specifier();
402  struct_exprt arg;
405  {
406  if(fs.index==-1)
407  {
408  INVARIANT(
409  arg_count<args.size(), "number of format must match specifiers");
410  arg=to_struct_expr(args[arg_count++]);
411  }
412  else
413  {
414  INVARIANT(fs.index > 0, "index in format should be positive");
415  INVARIANT(
416  static_cast<std::size_t>(fs.index)<=args.size(),
417  "number of format must match specifiers");
418 
419  // first argument `args[0]` corresponds to index 1
420  arg=to_struct_expr(args[fs.index-1]);
421  }
422  }
423  auto result = add_axioms_for_format_specifier(
424  fresh_symbol, fs, arg, index_type, char_type, array_pool, message, ns);
425  merge(constraints, std::move(result.second));
426  intermediary_strings.push_back(result.first);
427  }
428  else
429  {
430  const array_string_exprt str =
431  array_pool.fresh_string(index_type, char_type);
432  auto result =
433  add_axioms_for_constant(str, fe.get_format_text().get_content());
434  merge(constraints, result.second);
435  intermediary_strings.push_back(str);
436  }
437  }
438 
439  exprt return_code = from_integer(0, get_return_code_type());
440 
441  if(intermediary_strings.empty())
442  {
443  constraints.existential.push_back(
445  return {return_code, constraints};
446  }
447 
448  array_string_exprt str = intermediary_strings[0];
449 
450  if(intermediary_strings.size() == 1)
451  {
452  // Copy the first string
453  auto result = add_axioms_for_substring(
454  fresh_symbol, res, str, from_integer(0, index_type), str.length());
455  merge(constraints, std::move(result.second));
456  return {result.first, std::move(constraints)};
457  }
458 
459  // start after the first string and stop before the last
460  for(std::size_t i = 1; i < intermediary_strings.size() - 1; ++i)
461  {
462  const array_string_exprt &intermediary = intermediary_strings[i];
463  const array_string_exprt fresh =
464  array_pool.fresh_string(index_type, char_type);
465  auto result = add_axioms_for_concat(fresh_symbol, fresh, str, intermediary);
466  return_code = maximum(return_code, result.first);
467  merge(constraints, std::move(result.second));
468  str = fresh;
469  }
470 
471  auto result =
472  add_axioms_for_concat(fresh_symbol, res, str, intermediary_strings.back());
473  merge(constraints, std::move(result.second));
474  return {maximum(result.first, return_code), std::move(constraints)};
475 }
476 
482 std::string
483 utf16_constant_array_to_java(const array_exprt &arr, std::size_t length)
484 {
485  for(const auto &op : arr.operands())
486  PRECONDITION(op.id() == ID_constant);
487 
488  std::wstring out(length, '?');
489 
490  for(std::size_t i=0; i<arr.operands().size() && i<length; i++)
491  out[i] = numeric_cast_v<unsigned>(to_constant_expr(arr.operands()[i]));
492 
493  return utf16_native_endian_to_java(out);
494 }
495 
510 std::pair<exprt, string_constraintst> add_axioms_for_format(
511  symbol_generatort &fresh_symbol,
513  array_poolt &array_pool,
514  const messaget &message,
515  const namespacet &ns)
516 {
517  PRECONDITION(f.arguments().size() >= 3);
518  const array_string_exprt res =
519  char_array_of_pointer(array_pool, f.arguments()[1], f.arguments()[0]);
520  const array_string_exprt s1 = get_string_expr(array_pool, f.arguments()[2]);
521 
522  if(s1.length().id() == ID_constant && s1.content().id() == ID_array)
523  {
524  const auto length =
525  numeric_cast_v<std::size_t>(to_constant_expr(s1.length()));
526  std::string s=utf16_constant_array_to_java(
527  to_array_expr(s1.content()), length);
528  // List of arguments after s
529  std::vector<exprt> args(f.arguments().begin() + 3, f.arguments().end());
530  return add_axioms_for_format(
531  fresh_symbol, res, s, args, array_pool, message, ns);
532  }
533  else
534  {
535  message.warning()
536  << "ignoring format function with non constant first argument"
537  << message.eom;
538  return {from_integer(1, f.type()), {}};
539  }
540 }
std::pair< exprt, string_constraintst > add_axioms_for_concat(symbol_generatort &fresh_symbol, const array_string_exprt &res, const array_string_exprt &s1, const array_string_exprt &s2)
Add axioms enforcing that res is equal to the concatenation of s1 and s2.
The type of an expression, extends irept.
Definition: type.h:27
std::pair< exprt, string_constraintst > add_axioms_for_format(symbol_generatort &fresh_symbol, const array_string_exprt &res, const std::string &s, const exprt::operandst &args, array_poolt &array_pool, const messaget &message, const namespacet &ns)
Parse s and add axioms ensuring the output corresponds to the output of String.format.
Generates string constraints to link results from string functions with their arguments.
Application of (mathematical) function.
Definition: std_expr.h:4481
std::vector< format_specifiert > fspec
std::string utf16_constant_array_to_java(const array_exprt &arr, std::size_t length)
Construct a string from a constant array.
array_string_exprt fresh_string(const typet &index_type, const typet &char_type)
construct a string expression whose length and content are new variables
static std::vector< format_elementt > parse_format_string(std::string s)
Parse the given string into format specifiers and text.
std::pair< exprt, string_constraintst > add_axioms_for_string_of_float(symbol_generatort &fresh_symbol, const function_application_exprt &f, array_poolt &array_pool, const namespacet &ns)
String representation of a float value.
const array_exprt & to_array_expr(const exprt &expr)
Cast an exprt to an array_exprt.
Definition: std_expr.h:1759
format_specifiert(int _index, std::string _flag, int _width, int _precision, bool _dt, char c)
Generation of fresh symbols of a given type.
argumentst & arguments()
Definition: std_expr.h:4506
std::pair< exprt, string_constraintst > add_axioms_from_char(const function_application_exprt &f, array_poolt &array_pool)
Conversion from char to string.
typet & type()
Return the type of the expression.
Definition: expr.h:68
signedbv_typet get_return_code_type()
void merge(string_constraintst &result, string_constraintst other)
Merge two sets of constraints by appending to the first one.
string_constraintst constraints(class symbol_generatort &fresh_symbol) const
Set of constraints ensuring result corresponds to input in which lowercase characters of Basic Latin ...
Structure type, corresponds to C style structs.
Definition: std_types.h:276
std::pair< exprt, string_constraintst > add_axioms_from_bool(const function_application_exprt &f, array_poolt &array_pool)
Add axioms corresponding to the String.valueOf(Z) java function.
Correspondance between arrays and pointers string representations.
Collection of constraints of different types: existential formulas, universal formulas, and "not contains" (universal with one alternation).
format_textt(std::string _content)
#define INVARIANT(CONDITION, REASON)
This macro uses the wrapper function &#39;invariant_violated_string&#39;.
Definition: invariant.h:400
mstreamt & warning() const
Definition: message.h:391
Equality.
Definition: std_expr.h:1484
std::pair< exprt, string_constraintst > add_axioms_for_substring(symbol_generatort &fresh_symbol, const array_string_exprt &res, const array_string_exprt &str, const exprt &start, const exprt &end)
Add axioms ensuring that res corresponds to the substring of str between indexes ‘start’ = max(st...
array_string_exprt get_string_expr(array_poolt &pool, const exprt &expr)
casts an expression to a string expression, or fetches the actual string_exprt in the case of a symbo...
exprt & content()
Definition: string_expr.h:80
static void utf16_native_endian_to_java(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Definition: unicode.cpp:259
format_textt(const format_textt &fs)
std::pair< exprt, string_constraintst > add_axioms_for_string_of_int(const array_string_exprt &res, const exprt &input_int, size_t max_size, const namespacet &ns)
Add axioms enforcing that the string corresponds to the result of String.valueOf(I) or String...
API to expression classes.
mstreamt & error() const
Definition: message.h:386
A namespacet is essentially one or two symbol tables bound together, to allow for symbol lookups in t...
Definition: namespace.h:93
#define PRECONDITION(CONDITION)
Definition: invariant.h:438
std::pair< exprt, string_constraintst > add_axioms_for_constant(const array_string_exprt &res, irep_idt sval, const exprt &guard=true_exprt())
Add axioms ensuring that the provided string expression and constant are equal.
Class that provides messages with a built-in verbosity &#39;level&#39;.
Definition: message.h:144
bitvector_typet index_type()
Definition: c_types.cpp:16
const struct_typet & to_struct_type(const typet &type)
Cast a typet to a struct_typet.
Definition: std_types.h:349
dstringt has one field, an unsigned integer no which is an index into a static table of strings...
Definition: dstring.h:35
const struct_exprt & to_struct_expr(const exprt &expr)
Cast an exprt to a struct_exprt.
Definition: std_expr.h:1942
const constant_exprt & to_constant_expr(const exprt &expr)
Cast an exprt to a constant_exprt.
Definition: std_expr.h:4423
static exprt get_component_in_struct(const struct_exprt &expr, irep_idt component_name)
Helper for add_axioms_for_format_specifier.
static std::pair< array_string_exprt, string_constraintst > add_axioms_for_format_specifier(symbol_generatort &fresh_symbol, const format_specifiert &fs, const struct_exprt &arg, const typet &index_type, const typet &char_type, array_poolt &array_pool, const messaget &message, const namespacet &ns)
Parse s and add axioms ensuring the output corresponds to the output of String.format.
std::vector< exprt > operandst
Definition: expr.h:57
exprt maximum(const exprt &a, const exprt &b)
std::vector< exprt > existential
static eomt eom
Definition: message.h:284
exprt & length()
Definition: string_expr.h:70
static format_specifiert format_specifier_of_match(std::smatch &m)
Helper function for parsing format strings.
format_specifiert get_format_specifier() const
std::pair< exprt, string_constraintst > add_axioms_from_int_hex(const array_string_exprt &res, const exprt &i)
Add axioms stating that the string res corresponds to the integer argument written in hexadecimal...
Converting each lowercase character of Basic Latin and Latin-1 supplement to the corresponding upperc...
const array_string_exprt & char_array_of_pointer(array_poolt &pool, const exprt &pointer, const exprt &length)
Adds creates a new array if it does not already exists.
Base class for all expressions.
Definition: expr.h:54
std::size_t component_number(const irep_idt &component_name) const
Return the sequence number of the component with given name.
Definition: std_types.cpp:35
const format_textt & get_format_text() const
int8_t s1
Definition: bytecode_info.h:59
const typet & subtype() const
Definition: type.h:38
operandst & operands()
Definition: expr.h:78
constant_exprt from_integer(const mp_integer &int_value, const typet &type)
Struct constructor from list of elements.
Definition: std_expr.h:1920
std::pair< exprt, string_constraintst > add_axioms_from_float_scientific_notation(symbol_generatort &fresh_symbol, const array_string_exprt &res, const exprt &f, array_poolt &array_pool, const namespacet &ns)
Add axioms to write the float in scientific notation.
bitvector_typet char_type()
Definition: c_types.cpp:114
Array constructor from list of elements.
Definition: std_expr.h:1739