{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE CPP #-}
module Basement.UTF8.Helper
where
import Basement.Compat.Base
import Basement.Compat.Primitive
import Basement.Types.OffsetSize
import Basement.UTF8.Types
import Basement.Bits
import GHC.Prim
import GHC.Types
import GHC.Word
-- | Keep only the low 6 bits of a word, i.e. the payload bits of a UTF-8
-- continuation byte (@10xxxxxx@).
maskContinuation# :: Word# -> Word#
maskContinuation# v = and# v 0x3f##
{-# INLINE maskContinuation# #-}
-- | Keep only the low 5 bits of a word, i.e. the payload bits of the
-- leading byte of a 2-byte UTF-8 sequence (@110xxxxx@).
maskHeader2# :: Word# -> Word#
maskHeader2# h = and# h 0x1f##
{-# INLINE maskHeader2# #-}
-- | Keep only the low 4 bits of a word, i.e. the payload bits of the
-- leading byte of a 3-byte UTF-8 sequence (@1110xxxx@).
maskHeader3# :: Word# -> Word#
maskHeader3# h = and# h 0xf##
{-# INLINE maskHeader3# #-}
-- | Keep only the low 3 bits of a word, i.e. the payload bits of the
-- leading byte of a 4-byte UTF-8 sequence (@11110xxx@).
maskHeader4# :: Word# -> Word#
maskHeader4# h = and# h 0x7##
{-# INLINE maskHeader4# #-}
-- | Bitwise OR of three unboxed words.
or3# :: Word# -> Word# -> Word# -> Word#
or3# a b c = or# a (or# b c)
{-# INLINE or3# #-}
-- | Bitwise OR of four unboxed words.
or4# :: Word# -> Word# -> Word# -> Word# -> Word#
or4# a b c d = or# (or# a b) (or# c d)
{-# INLINE or4# #-}
-- | Box an unboxed word as a 'Char', treating the word as the code point.
--
-- The value is passed straight to @chr#@; nothing in this function checks
-- that it is a valid code point.
toChar# :: Word# -> Char
toChar# w = C# (chr# (word2Int# w))
{-# INLINE toChar# #-}
-- | Turn the single byte carried by a 'StepASCII' into a 'Char'.
--
-- The byte is converted as-is with @word8ToChar#@; no check is made here
-- that it is below 0x80.
toChar1 :: StepASCII -> Char
toChar1 (StepASCII (W8# w)) = C# (word8ToChar# w)
-- | Assemble a 'Char' from a 2-byte UTF-8 sequence: the leading byte
-- (wrapped in 'StepASCII') followed by one continuation byte.
--
-- The 5 payload bits of the header become bits 6..10 of the code point and
-- the 6 payload bits of the continuation become bits 0..5. The marker bits
-- of the input bytes are masked away, not validated.
toChar2 :: StepASCII -> Word8 -> Char
toChar2 (StepASCII (W8# b1)) (W8# b2) =
    toChar# (or# (uncheckedShiftL# (maskHeader2# w1) 6#) (maskContinuation# w2))
  where
    w1 = word8ToWord# b1
    w2 = word8ToWord# b2
-- | Assemble a 'Char' from a 3-byte UTF-8 sequence: the leading byte
-- (wrapped in 'StepASCII') followed by two continuation bytes.
--
-- Code point layout: header payload (4 bits) at bits 12..15, first
-- continuation payload at bits 6..11, second continuation payload at
-- bits 0..5. The marker bits of the input bytes are masked away, not
-- validated.
toChar3 :: StepASCII -> Word8 -> Word8 -> Char
toChar3 (StepASCII (W8# b1)) (W8# b2) (W8# b3) =
    toChar# (or3# (uncheckedShiftL# (maskHeader3# w1) 12#)
                  (uncheckedShiftL# (maskContinuation# w2) 6#)
                  (maskContinuation# w3)
            )
  where
    w1 = word8ToWord# b1
    w2 = word8ToWord# b2
    w3 = word8ToWord# b3
-- | Assemble a 'Char' from a 4-byte UTF-8 sequence: the leading byte
-- (wrapped in 'StepASCII') followed by three continuation bytes.
--
-- Code point layout: header payload (3 bits) at bits 18..20, then the three
-- continuation payloads at bits 12..17, 6..11 and 0..5 respectively. The
-- marker bits of the input bytes are masked away, not validated.
toChar4 :: StepASCII -> Word8 -> Word8 -> Word8 -> Char
toChar4 (StepASCII (W8# b1)) (W8# b2) (W8# b3) (W8# b4) =
    toChar# (or4# (uncheckedShiftL# (maskHeader4# w1) 18#)
                  (uncheckedShiftL# (maskContinuation# w2) 12#)
                  (uncheckedShiftL# (maskContinuation# w3) 6#)
                  (maskContinuation# w4)
            )
  where
    w1 = word8ToWord# b1
    w2 = word8ToWord# b2
    w3 = word8ToWord# b3
    w4 = word8ToWord# b4
-- | The UTF-8 encoding of one character, held as its individual bytes.
-- Each constructor carries as many bytes as its numeric suffix says.
data UTF8Char
    = UTF8_1 {-# UNPACK #-} !Word8
    | UTF8_2 {-# UNPACK #-} !Word8
             {-# UNPACK #-} !Word8
    | UTF8_3 {-# UNPACK #-} !Word8
             {-# UNPACK #-} !Word8
             {-# UNPACK #-} !Word8
    | UTF8_4 {-# UNPACK #-} !Word8
             {-# UNPACK #-} !Word8
             {-# UNPACK #-} !Word8
             {-# UNPACK #-} !Word8
-- | Encode a 'Char' as a 'UTF8Char', choosing the 1, 2, 3 or 4 byte form
-- from the magnitude of its code point (below 0x80, 0x800, 0x10000, or
-- anything larger).
asUTF8Char :: Char -> UTF8Char
asUTF8Char !(C# c)
  | bool# (ltWord# x 0x80##   ) = encode1
  | bool# (ltWord# x 0x800##  ) = encode2
  | bool# (ltWord# x 0x10000##) = encode3
  | otherwise                   = encode4
  where
    -- the code point as an unboxed word
    !x = int2Word# (ord# c)

    -- single byte: the code point itself
    encode1 = UTF8_1 (W8# (wordToWord8# x))

    -- header 110xxxxx carries bits 6 and up, then one continuation
    encode2 =
        let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 6#) 0xc0##))
            !x2 = toContinuation x
         in UTF8_2 x1 x2

    -- header 1110xxxx carries bits 12 and up, then two continuations
    encode3 =
        let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 12#) 0xe0##))
            !x2 = toContinuation (uncheckedShiftRL# x 6#)
            !x3 = toContinuation x
         in UTF8_3 x1 x2 x3

    -- header 11110xxx carries bits 18 and up, then three continuations
    encode4 =
        let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 18#) 0xf0##))
            !x2 = toContinuation (uncheckedShiftRL# x 12#)
            !x3 = toContinuation (uncheckedShiftRL# x 6#)
            !x4 = toContinuation x
         in UTF8_4 x1 x2 x3 x4

    -- build a continuation byte 10xxxxxx from the low 6 bits of a word
    toContinuation :: Word# -> Word8
    toContinuation w = W8# (wordToWord8# (or# (and# w 0x3f##) 0x80##))
    {-# INLINE toContinuation #-}
-- | Number of bytes held by a 'UTF8Char', determined by its constructor.
numBytes :: UTF8Char -> CountOf Word8
numBytes UTF8_1{} = CountOf 1
numBytes UTF8_2{} = CountOf 2
numBytes UTF8_3{} = CountOf 3
numBytes UTF8_4{} = CountOf 4
-- | Given the leading byte of a UTF-8 sequence, return how many bytes the
-- whole sequence occupies.
--
-- Any byte below 0xC0 (ASCII, but also a stray continuation byte) counts
-- as 1; the byte is classified, not validated.
skipNextHeaderValue :: Word8 -> CountOf Word8
skipNextHeaderValue !x
    | x < 0xC0  = CountOf 1 -- below 0b11000000
    | x < 0xE0  = CountOf 2 -- below 0b11100000
    | x < 0xF0  = CountOf 3 -- below 0b11110000
    | otherwise = CountOf 4
{-# INLINE skipNextHeaderValue #-}
-- | True when the byte carried by the 'StepASCII' is below 0x80, i.e. it
-- is a complete single-byte (ASCII) character.
headerIsAscii :: StepASCII -> Bool
headerIsAscii (StepASCII x) = x < 0x80
-- | Number of UTF-8 bytes needed to encode the code point given as an 'Int'.
--
-- Calls 'error' for values of 0x110000 and above. Values are only compared
-- against upper bounds, so anything below 0x80 (negative numbers included)
-- yields 1.
charToBytes :: Int -> CountOf Word8
charToBytes c
    | c < 0x80     = CountOf 1
    | c < 0x800    = CountOf 2
    | c < 0x10000  = CountOf 3
    | c < 0x110000 = CountOf 4
    | otherwise    = error ("invalid code point: " `mappend` show c)
-- | Encode a 'Char' into a 'CharUTF8': the UTF-8 bytes of the character
-- packed into one 32-bit word.
--
-- The leading byte is stored in the least significant byte of the word and
-- each following byte in the next more significant one. Code points below
-- 0x80 are stored unchanged.
encodeCharUTF8 :: Char -> CharUTF8
encodeCharUTF8 !(C# c)
  | bool# (ltWord# x 0x80##   ) = CharUTF8 (W32# (wordToWord32# x))
  | bool# (ltWord# x 0x800##  ) = CharUTF8 (W32# (wordToWord32# encode2))
  | bool# (ltWord# x 0x10000##) = CharUTF8 (W32# (wordToWord32# encode3))
  | otherwise                   = CharUTF8 (W32# (wordToWord32# encode4))
  where
    -- the code point as an unboxed word
    !x = int2Word# (ord# c)

    -- clearing masks: drop every bit that must be 0 in the encoded bytes
    -- (and everything beyond the last byte of the sequence)
    mask2 = 0x0000bfdf## -- 1 continuation , 5 bits header
    mask3 = 0x00bfbfef## -- 2 continuations, 4 bits header
    mask4 = 0xbfbfbff7## -- 3 continuations, 3 bits header

    -- setting masks: the marker bits that must be 1 in the encoded bytes
    set2  = 0x000080c0## -- 10xxxxxx         110xxxxx
    set3  = 0x008080e0## -- 10xxxxxx (twice)  1110xxxx
    set4  = 0x808080f0## -- 10xxxxxx (thrice) 11110xxx

    encode2 = and# mask2 (or3# set2
                               (uncheckedShiftRL# x 6#) -- high 5 bits into byte 0
                               (uncheckedShiftL# x 8# ) -- low bits into byte 1
                         )
    encode3 = and# mask3 (or4# set3
                               (uncheckedShiftRL# x 12#)               -- high 4 bits into byte 0
                               (and# 0x3f00## (uncheckedShiftL# x 2#)) -- bits 6..11 into byte 1
                               (uncheckedShiftL# x 16# )               -- low bits into byte 2
                         )
    encode4 = and# mask4 (or4# set4
                               (uncheckedShiftRL# x 18#)                       -- high 3 bits into byte 0
                               (or# (and# 0x3f00## (uncheckedShiftRL# x 4#))   -- bits 12..17 into byte 1
                                    (and# 0x3f0000## (uncheckedShiftL# x 10#)) -- bits 6..11 into byte 2
                               )
                               (uncheckedShiftL# x 24# )                       -- low bits into byte 3
                         )
-- | Decode a 'CharUTF8' back into a 'Char'.
--
-- The packed word is read with the leading byte in its least significant
-- byte and each following byte in the next more significant one. The
-- sequence length is chosen from the leading byte by the @isCharUTF8Case@
-- predicates. Marker bits are masked away rather than validated, so a word
-- that is not a proper encoding yields an arbitrary (possibly invalid)
-- 'Char'.
decodeCharUTF8 :: CharUTF8 -> Char
decodeCharUTF8 c@(CharUTF8 !(W32# w_))
    | isCharUTF8Case1 c = toChar# w
    | isCharUTF8Case2 c = decode2
    | isCharUTF8Case3 c = decode3
    | otherwise         = decode4
  where
    w = word32ToWord# w_

    -- 2 bytes: header payload to bits 6..10, byte 1 payload to bits 0..5
    decode2 =
        toChar# (or# (uncheckedShiftL# (maskHeader2# w) 6#)
                     (maskContinuation# (uncheckedShiftRL# w 8#))
                )

    -- 3 bytes: header payload to bits 12..15, byte 1 to 6..11, byte 2 to 0..5
    decode3 =
        toChar# (or3# (uncheckedShiftL# (maskHeader3# w) 12#)
                      -- byte 1 payload sits at bits 8..13 of w and belongs at
                      -- bits 6..11 of the code point: shift right by 2
                      (uncheckedShiftRL# (and# 0x3f00## w) 2#)
                      (maskContinuation# (uncheckedShiftRL# w 16#))
                )

    -- 4 bytes: header payload to bits 18..20, byte 1 to 12..17,
    -- byte 2 to 6..11, byte 3 to 0..5
    decode4 =
        toChar# (or4# (uncheckedShiftL# (maskHeader4# w) 18#)
                      -- byte 1 payload: bits 8..13 of w to bits 12..17
                      (uncheckedShiftL# (and# 0x3f00## w) 4#)
                      -- byte 2 payload: bits 16..21 of w to bits 6..11
                      (uncheckedShiftRL# (and# 0x3f0000## w) 10#)
                      (maskContinuation# (uncheckedShiftRL# w 24#))
                )
-- | True when bit 0x80 of the packed word is clear, i.e. the leading byte
-- (stored in the lowest byte) has the single-byte form @0xxxxxxx@.
isCharUTF8Case1 :: CharUTF8 -> Bool
isCharUTF8Case1 (CharUTF8 !w) = (w .&. 0x80) == 0
{-# INLINE isCharUTF8Case1 #-}
-- | True when bit 0x20 of the packed word is clear.
--
-- This identifies a 2-byte leading byte (@110xxxxx@) only once the 1-byte
-- case has been ruled out: the test looks at this single bit and nothing
-- else.
isCharUTF8Case2 :: CharUTF8 -> Bool
isCharUTF8Case2 (CharUTF8 !w) = (w .&. 0x20) == 0
{-# INLINE isCharUTF8Case2 #-}
-- | True when bit 0x10 of the packed word is clear.
--
-- This identifies a 3-byte leading byte (@1110xxxx@) only once the 1-byte
-- and 2-byte cases have been ruled out: the test looks at this single bit
-- and nothing else.
isCharUTF8Case3 :: CharUTF8 -> Bool
isCharUTF8Case3 (CharUTF8 !w) = (w .&. 0x10) == 0
{-# INLINE isCharUTF8Case3 #-}
-- | True when bit 0x08 of the packed word is clear.
--
-- This identifies a 4-byte leading byte (@11110xxx@) only once the shorter
-- cases have been ruled out: the test looks at this single bit and nothing
-- else.
isCharUTF8Case4 :: CharUTF8 -> Bool
isCharUTF8Case4 (CharUTF8 !w) = (w .&. 0x08) == 0
{-# INLINE isCharUTF8Case4 #-}