Libparserutils
Macros | Variables
utf8impl.h File Reference

UTF-8 manipulation macros (implementation). More...

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

Go to the source code of this file.

Macros

#define UTF8_TO_UCS4(s, len, ucs4, clen, error)
 Convert a UTF-8 multibyte sequence into a single UCS-4 character.
 
#define UTF8_FROM_UCS4(ucs4, s, len, error)
 Convert a single UCS-4 character into a UTF-8 multibyte sequence.
 
#define UTF8_LENGTH(s, max, len, error)
 Calculate the length (in characters) of a bounded UTF-8 string.
 
#define UTF8_CHAR_BYTE_LENGTH(s, len, error)
 Calculate the length (in bytes) of a UTF-8 character.
 
#define UTF8_PREV(s, off, prevoff, error)
 Find previous legal UTF-8 char in string.
 
#define UTF8_NEXT(s, len, off, nextoff, error)
 Find next legal UTF-8 char in string.
 
#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error)
 Skip to start of next sequence in UTF-8 input.
 

Variables

const uint8_t numContinuations [256]
 Number of continuation bytes for a given start byte.
 

Detailed Description

UTF-8 manipulation macros (implementation).

Definition in file utf8impl.h.

Macro Definition Documentation

◆ UTF8_CHAR_BYTE_LENGTH

#define UTF8_CHAR_BYTE_LENGTH (   s,
  len,
  error 
)
Value:
do { \
	/* Both the input pointer and the output location are required; \
	 * report the failure instead of leaving error untouched. */ \
	if ((s) == NULL || (len) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	/* Byte length = continuation bytes implied by the start byte, \
	 * plus one for the start byte itself. */ \
	*(len) = numContinuations[(s)[0]] + 1; \
	\
	(error) = PARSERUTILS_OK; \
} while(0)
size_t len
Definition codec_8859.c:23
@ PARSERUTILS_OK
Definition errors.h:19
@ PARSERUTILS_BADPARM
Definition errors.h:22
const uint8_t numContinuations[256]
Number of continuation bytes for a given start byte.
Definition utf8.c:20

Calculate the length (in bytes) of a UTF-8 character.

Parameters
s: Pointer to start of character
len: Pointer to location to receive length
error: Location to receive error code

Definition at line 228 of file utf8impl.h.

◆ UTF8_FROM_UCS4

#define UTF8_FROM_UCS4 (   ucs4,
  s,
  len,
  error 
)

Convert a single UCS-4 character into a UTF-8 multibyte sequence.

Encoding of UCS values outside the UTF-16 plane has been removed from RFC3629. This macro conforms to RFC2279, however.

Parameters
ucs4: The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
s: Pointer to pointer to output buffer, updated on exit
len: Pointer to length, in bytes, of output buffer, updated on exit
error: Location to receive error code

Definition at line 123 of file utf8impl.h.

◆ UTF8_LENGTH

#define UTF8_LENGTH (   s,
  max,
  len,
  error 
)

Calculate the length (in characters) of a bounded UTF-8 string.

Parameters
s: The string
max: Maximum length
len: Pointer to location to receive length of string
error: Location to receive error code

Definition at line 182 of file utf8impl.h.

◆ UTF8_NEXT

#define UTF8_NEXT (   s,
  len,
  off,
  nextoff,
  error 
)
Value:
do { \
	/* Reject a missing string or output location, or a start offset \
	 * at or beyond len; report the failure to the caller. */ \
	if ((s) == NULL || (off) >= (len) || (nextoff) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	/* Skip current start byte (if present - may be mid-sequence). \
	 * Note: off is advanced in place, so it must be a modifiable \
	 * lvalue. */ \
	if ((s)[off] < 0x80 || ((s)[off] & 0xC0) == 0xC0) \
		(off)++; \
	\
	/* Skip continuation bytes (top bits 10), never reading at or \
	 * past len. */ \
	while ((off) < (len) && ((s)[off] & 0xC0) == 0x80) \
		(off)++; \
	\
	*(nextoff) = (off); \
	\
	(error) = PARSERUTILS_OK; \
} while(0)

Find next legal UTF-8 char in string.

Parameters
s: The string (assumed valid)
len: Maximum offset in string
off: Offset in the string to start at
nextoff: Pointer to location to receive offset of first byte of next legal character
error: Location to receive error code

Definition at line 274 of file utf8impl.h.

◆ UTF8_NEXT_PARANOID

#define UTF8_NEXT_PARANOID (   s,
  len,
  off,
  nextoff,
  error 
)

Skip to start of next sequence in UTF-8 input.

Parameters
s: The string (assumed to be of dubious validity)
len: Maximum offset in string
off: Offset in the string to start at
nextoff: Pointer to location to receive offset of first byte of next legal character
error: Location to receive error code

Definition at line 303 of file utf8impl.h.

◆ UTF8_PREV

#define UTF8_PREV (   s,
  off,
  prevoff,
  error 
)
Value:
do { \
	/* The string and the output location are both required; report \
	 * the failure instead of leaving error untouched. */ \
	if ((s) == NULL || (prevoff) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	/* Step backwards until offset 0 is reached or the byte just \
	 * stepped onto is not a continuation byte (top bits 10). \
	 * Note: off is decremented in place, so it must be a modifiable \
	 * lvalue. */ \
	while ((off) != 0 && ((s)[--(off)] & 0xC0) == 0x80) \
		/* do nothing */; \
	\
	*(prevoff) = (off); \
	\
	(error) = PARSERUTILS_OK; \
} while(0)

Find previous legal UTF-8 char in string.

Parameters
s: The string
off: Offset in the string to start at
prevoff: Pointer to location to receive offset of first byte of previous legal character
error: Location to receive error code

Definition at line 249 of file utf8impl.h.

◆ UTF8_TO_UCS4

#define UTF8_TO_UCS4 (   s,
  len,
  ucs4,
  clen,
  error 
)

Convert a UTF-8 multibyte sequence into a single UCS-4 character.

Encoding of UCS values outside the UTF-16 plane has been removed from RFC3629. This macro conforms to RFC2279, however.

Parameters
s: The sequence to process
len: Length of sequence
ucs4: Pointer to location to receive UCS-4 character (host endian)
clen: Pointer to location to receive byte length of UTF-8 sequence
error: Location to receive error code

Definition at line 34 of file utf8impl.h.

Variable Documentation

◆ numContinuations

const uint8_t numContinuations[256]
extern

Number of continuation bytes for a given start byte.

Definition at line 20 of file utf8.c.