hunk ./Codec/Binary/UTF8/Generic.hs 2
--- | This module provides fast, validated encoding and decoding functions
--- between 'ByteString's and 'String's. It does not exactly match the
--- output of the Codec.Binary.UTF8.String output for invalid encodings
--- as the number of replacement characters is sometimes longer.
+--
+-- |
+-- Module      : Codec.Binary.UTF8.Generic
+-- Copyright   : (c) Iavor S. Diatchki 2009
+-- License     : BSD3-style (see LICENSE)
+--
+-- Maintainer  : emertens@galois.com
+-- Stability   : experimental
+-- Portability : portable
+--
hunk ./Codec/Binary/UTF8/Generic.hs 161
-      Just (d,_) | d >= 0x10000 -> (toEnum d, 4)
-                 | otherwise    -> (replacement_char, 4)
+      Just (d,_) | d >= 0x10000 && d < 0x110000 -> (toEnum d, 4)
+                 | otherwise                    -> (replacement_char, 4)
hunk ./Codec/Binary/UTF8/String.hs 19
+
+  , isUTF8Encoded
+  , utf8Encode
hunk ./Codec/Binary/UTF8/String.hs 24
-import Data.Word (Word8)
+import Data.Word (Word8,Word32)
hunk ./Codec/Binary/UTF8/String.hs 101
+
+-- | @utf8Encode str@ is a convenience function: it returns @str@
+-- unchanged when 'isUTF8Encoded' already recognizes it as UTF-8, and
+-- otherwise returns @'encodeString' str@. Sometimes useful, but
+-- you are better off keeping track of the encoding so as to avoid
+-- the cost of checking.
+utf8Encode :: String -> String
+utf8Encode str
+  | isUTF8Encoded str = str
+  | otherwise         = encodeString str
+
+
+-- | @isUTF8Encoded str@ tries to recognize input string as being in UTF-8 form.
+--
+-- Every 'Char' is treated as one byte of the encoding. The result is
+-- 'True' for the empty string and for strings made up only of
+-- well-formed sequences; it is 'False' as soon as a malformed,
+-- truncated or overlong sequence is met, or a sequence decodes to a
+-- surrogate, to U+FFFE/U+FFFF, or to a value above 0x10ffff.
+isUTF8Encoded :: String -> Bool
+isUTF8Encoded [] = True
+isUTF8Encoded (x:xs) =
+  case ox of
+    _ | ox < 0x80  -> isUTF8Encoded xs
+      -- A 'Char' above 0xff cannot be a single encoded byte.
+      | ox > 0xff  -> False
+      -- 0x80..0xbf are continuation bytes and may not start a sequence.
+      | ox < 0xc0  -> False
+      | ox < 0xe0  -> check1
+      -- Three-byte sequences must decode to at least 0x800; a smaller
+      -- value is an overlong form, just like the minimums enforced for
+      -- the two- and four-byte cases.
+      | ox < 0xf0  -> check_byte 2 0xf 0x800
+      | ox < 0xf8  -> check_byte 3 0x7 0x10000
+      -- The five- and six-byte lead bytes can never succeed: their
+      -- minimum value is already above the 0x10ffff ceiling checked in
+      -- 'aux'.
+      | ox < 0xfc  -> check_byte 4 0x3 0x200000
+      | ox < 0xfe  -> check_byte 5 0x1 0x4000000
+      | otherwise  -> False
+  where
+    ox = toW32 x
+
+    toW32 :: Char -> Word32
+    toW32 ch = fromIntegral (fromEnum ch)
+
+    -- Two-byte sequence: exactly one continuation byte, and the decoded
+    -- value must not fit in a single byte (overlong).
+    check1 =
+      case xs of
+        [] -> False
+        c1 : ds
+          | oc .&. 0xc0 /= 0x80 || d < 0x000080 -> False
+          | otherwise -> isUTF8Encoded ds
+          where
+            oc = toW32 c1
+            d = ((ox .&. 0x1f) `shiftL` 6) .|. (oc .&. 0x3f)
+
+    -- @check_byte i mask overlong@ consumes @i@ continuation bytes after
+    -- the lead byte, whose payload bits are selected by @mask@, and
+    -- requires the decoded value to be at least @overlong@.
+    check_byte :: Int -> Word32 -> Word32 -> Bool
+    check_byte i mask overlong = aux i xs (ox .&. mask)
+      where
+        -- All continuation bytes consumed: validate the accumulated
+        -- code point, then carry on with the rest of the input.
+        aux 0 rs acc
+          | overlong <= acc &&
+            acc <= 0x10ffff &&
+            (acc < 0xd800 || 0xdfff < acc) &&
+            (acc < 0xfffe || 0xffff < acc) = isUTF8Encoded rs
+          | otherwise = False
+
+        -- Fold the low six bits of each continuation byte into @acc@.
+        aux n (r:rs) acc
+          | toW32 r .&. 0xc0 == 0x80 =
+              aux (n-1) rs (acc `shiftL` 6 .|. (toW32 r .&. 0x3f))
+
+        -- Input ended early, or the byte is not a continuation byte.
+        aux _ _ _ = False
+
hunk ./Data/ByteString/Lazy/UTF8.hs 1
--- | This module provides fast, validated encoding and decoding functions
+--
+-- |
+-- Module      : Data.ByteString.Lazy.UTF8
+-- Copyright   : (c) Iavor S. Diatchki 2009
+-- License     : BSD3-style (see LICENSE)
+--
+-- Maintainer  : emertens@galois.com
+-- Stability   : experimental
+-- Portability : portable
+--
+-- This module provides fast, validated encoding and decoding functions
hunk ./Data/ByteString/Lazy/UTF8.hs 117
-      Just (d,_) | d >= 0x10000 -> (toEnum d, 4)
-                 | otherwise    -> (replacement_char, 4)
+      Just (d,_) | d >= 0x10000 && d < 0x110000 -> (toEnum d, 4)
+                 | otherwise                    -> (replacement_char, 4)
hunk ./Data/ByteString/UTF8.hs 1
--- | This module provides fast, validated encoding and decoding functions
+--
+-- |
+-- Module      : Data.ByteString.UTF8
+-- Copyright   : (c) Iavor S. Diatchki 2009
+-- License     : BSD3-style (see LICENSE)
+--
+-- Maintainer  : emertens@galois.com
+-- Stability   : experimental
+-- Portability : portable
+--
+-- This module provides fast, validated encoding and decoding functions
hunk ./Data/ByteString/UTF8.hs 116
-      Just (d,_) | d >= 0x10000 -> (toEnum d, 4)
-                 | otherwise    -> (replacement_char, 4)
+      Just (d,_) | d >= 0x10000 && d < 0x110000 -> (toEnum d, 4)
+                 | otherwise                    -> (replacement_char, 4)
hunk ./Data/String/UTF8.hs 1
-{-# LANGUAGE MultiParamTypeClasses #-}
-{-# OPTIONS_GHC -fallow-undecidable-instances #-}
+--
+-- |
+-- Module      : Data.String.UTF8
+-- Copyright   : (c) Iavor S. Diatchki 2009
+-- License     : BSD3-style (see LICENSE)
+--
+-- Maintainer  : emertens@galois.com
+-- Stability   : experimental
+-- Portability : portable
+--
+{-# LANGUAGE MultiParamTypeClasses, UndecidableInstances #-}
hunk ./Data/String/UTF8.hs 48
--- | The type of strngs that are represented using tthe UTF8 encoding.
+-- | The type of strings that are represented using the UTF8 encoding.
hunk ./utf8-string.cabal 2
-Version: 0.3.4
+Version: 0.3.6
hunk ./utf8-string.cabal 32
+                    System.Environment.UTF8
adddir ./System/Environment
addfile ./System/Environment/UTF8.hs
hunk ./System/Environment/UTF8.hs 1
+--
+-- |
+-- Module      : System.Environment.UTF8
+-- Copyright   : (c) Eric Mertens 2009
+-- License     : BSD3-style (see LICENSE)
+--
+-- Maintainer  : emertens@galois.com
+-- Stability   : experimental
+-- Portability : portable
+--
+-- Support for UTF-8 based environment manipulation
+--
+-- Each function wraps its "System.Environment" namesake: strings read
+-- from the environment are passed through 'decodeString', and strings
+-- handed to the environment are passed through 'encodeString'.
+-- NOTE(review): this assumes "System.Environment" exchanges one 'Char'
+-- per raw byte; confirm for the GHC versions being targeted.
+--
+module System.Environment.UTF8
+  (getArgs, getProgName, getEnv, withArgs, withProgName, getEnvironment)
+  where
+
+import Codec.Binary.UTF8.String (decodeString, encodeString)
+import qualified System.Environment as Sys
+
+-- | The program's command line arguments, each UTF-8 decoded.
+getArgs :: IO [String]
+getArgs = map decodeString `fmap` Sys.getArgs
+
+-- | The program name, UTF-8 decoded.
+getProgName :: IO String
+getProgName = decodeString `fmap` Sys.getProgName
+
+-- | Look up an environment variable. The name is UTF-8 encoded before
+-- the lookup so that non-ASCII names match, and the value found is
+-- UTF-8 decoded. Any exception from the underlying lookup propagates
+-- unchanged.
+getEnv :: String -> IO String
+getEnv x = decodeString `fmap` Sys.getEnv (encodeString x)
+
+-- | Run an action with the given argument list. The arguments are
+-- UTF-8 encoded first, so 'getArgs' inside the action returns the
+-- strings that were supplied.
+withArgs :: [String] -> IO a -> IO a
+withArgs args = Sys.withArgs (map encodeString args)
+
+-- | Run an action with the given program name. The name is UTF-8
+-- encoded first, so 'getProgName' inside the action returns the
+-- string that was supplied.
+withProgName :: String -> IO a -> IO a
+withProgName name = Sys.withProgName (encodeString name)
+
+-- | The whole environment as name/value pairs, with both components
+-- UTF-8 decoded.
+getEnvironment :: IO [(String,String)]
+getEnvironment = map f `fmap` Sys.getEnvironment
+  where f (a,b) = (decodeString a, decodeString b)