> {-# OPTIONS -fglasgow-exts #-} > module QuicQuid.Term(Term(..),parse,showCanonical,showJSON1) where > import Data.List > import Data.Char (ord, chr, isControl,isHexDigit,isLetter,isDigit) > import Text.Parsers.Frisby > import Data.Maybe > import qualified Data.Map as M > import Data.Word > import Data.Bits > import Numeric > import Text.JSON A simple syntax used to: - express and transfer data across multiple systems and programming languages - formulate requests and offers for information (ask/answer) - code query results (bindings and channels) Requirements: - easy to read and write for humans (regular and compact) - expressive, able to code any possible data structure and both textual and binary information - extensible (supports any possible future semantic) - easy and efficient to produce (from the host language data structures or by 'manually' concatenating strings) - easy and efficient to parse - safe: no undesired side effects can be produced by parsing (OR BY TRANSFERING VIA JSON) Other Desiderata: - Compatible/Similar/Convertible with existing formats (JSON,..) - Multiple copies of the same information do not need to be replicated - Support for cycles and infinite data structures TODO: - Switch from String to bytestring (with support for UTF characters, see -XOverloadedStrings and the IsString class) or stringtable-atom - GADT implementation (to preserve type info about the kind of term being created). - CHECK: should "and"/"or" be defined as operands at this level, or is it ok to have them as predicates? - CHECK: for operands like "and" or "or", we might need a concept of 'set', a sequence where order is irrelevant (is it irrelevant?). - CHECK: are we missing a 'binary' format (might also be added as a predicate) - Add support for 'netstrings': 8:hi there an UTF-8 coded string, 8 UNICODE characters long (NOT 8 bytes) - ?? Add support to specify codings: 3#utf-8#a"m a 3 byte long binary data, to be interpreted as utf-8 > -- |The abstract syntax (a superset of JSON): > data Term = Null -- ^A null/undefined value, the only member of the 1-sized set. > | T -- ^True > | F -- ^False > | Str String -- ^An Unicode string, of unspecified length > | Num Double -- ^A double precision number > | Arr [Term] -- ^An array (ordered sequence) of terms, can also be seen as a special case of an Obj whose keys are Numbers (Integers actually). > | Obj (M.Map String Term) -- ^An object (in the JavaScript/JSON sense) NOTE: used to be Map Term Term, maybe we don't need this additional complexity. > | App Term Term -- ^The 'application' of a term to another term, a generic mechanism to build more complex terms (TODO: decide if the name is appropriate) > | Var String -- ^A named variable. It can also be seen as the application of "?" to a String. > deriving (Eq,Ord,Show) > -- instance Show Term where show = showCanonical Note: There are different kinds of maps that are in common use, from specific to more general: array = Map Integer Term [ a b c ] binding = Map String Term {_:"?" "x":b "c":d(f)} obj/map = Map Term Term {salary(2007):100 salary(2008):200} function = Map Pattern Term { ?n->mul(?n 2)} even recursive with cc {0:1 ?n:mul(?n KK(sub(?n 1)))} Syntactically they might all be treated uniformly if we define a canonical mapping Term -> String. The abstract syntax can be converted from/to: -- a "loose" encoding, for human input/visualisation -- a "canonical" encoding, for comparison/matching and digital signatures -- a "transport" encoding, for transmission/storage -- native data structures or code of any modern programming language Requirements for the transport encoding: * Parsable from any language using existing parsers (therefore: JSON or XML). * Reasonably easy to use directly from "raw/as received" form (JSON/XML). * Correct/Unambiguous (1<->1 mapping with terms) * Safe under evaluation: agents cannot inject executable code. > -- |Show term in canonical form, every term has a single canonical representation. > -- TODO: revise (e.g. use netstrings, see http://en.wikipedia.org/wiki/Canonical_S-expressions). > showCanonical = sh1 showCanonical > {-| Show term in JSON. > Simple terms are coded directly in JSON. > The others are coded using the JSON Object: > Every object has two properties: > "t": the "type" of the term coded by the object. > "v": the coded term (the "value"). > Examples: > TYPE CANONICAL JSON > Variable ?xyz {"t":"v" ,"v":"xyx"} > Application f x {"t":"a" ,"v":[f,x]} > Map String Term {"k":o,..} {"t":"o" ,"v":{"k":o} > > TODO?: > Map Term Term {o(4):..} {"t":"m" ,"v":[[k1,..kn],[v1..vn]} > -} > showJSON1 ( Var s) = tv "v" (encode s) > showJSON1 (App h t) = tv "a" $ "[" ++ showJSON1 h ++ "," ++ showJSON1 t ++ "]" > showJSON1 o@(Obj m) = tv "o" $ shJ o > showJSON1 t = shJ t > tv t v = "{\"t\":\"" ++ t ++ "\",\"v\":" ++ v ++ "}" > shJ = sh1 showJSON1 -- |Show term in a form that can be directly parsed by the JavaScript "eval" function (with a few extra definitions) -- TODO: support Obj (JavaScript has not generic term keys). -- Prob: execution cannot take directly place in json..() as we do not control evaluation context -- so it has to be returned as a string and the evaluated in an appropriate context. showJavaScript (Var s) = "v(" ++ showJSON s ++ ")" -- TODO: is this correct ? or should be a generic application? showJavaScript (App h t) = "f(" ++ shJ h ++ "," ++ shJ t ++ ")" showJavaScript t = sh1 showJavaScript t shJ = sh1 showJavaScript > -- |TODO: Fix inefficiency (use bytestring, optimise string appending with ShowS) > sh1 _ (Null) = "null" > sh1 _ (T) = "true" > sh1 _ (F) = "false" > sh1 _ (Str s) = encode s > sh1 _ (Num n) = encode n > sh1 _ (Var s) = "?" ++ show s > sh1 f (App h t) = f h ++ "(" ++ f t ++ ")" > sh1 f (Arr l) = "[" ++ (intercalate "," $ map f l) ++ "]" > -- BUG: order keys according to a canonical order > sh1 f (Obj m) = "{" ++ (intercalate "," $ map (\(k,v) -> f (Str k) ++ ":" ++ f v) (M.toList m)) ++ "}" > -- |Show a Haskell String as a JSON string. > -- |Parse a term from a 'loose' String representation. > parse :: String -> Maybe Term > parse = runPeg term > -- | The term parser > -- Application is left associative: a b c = (a b) c > term :: PM s (P s (Maybe Term)) > term = mdo > logic <- newRule $ fmap Just (term <<- optional space <<- eof) // unit Nothing > term <- newRule $ many1 (simple // (optional space ->> char '(' ->> term <<- optional space <<- char ')' )) ## una > simple <- newRule $ optional space ->> (null // t // f // double // var // string // seq // obj) > null <- newRule $ text "null" ##> Null > t <- newRule $ text "true" ##> T > f <- newRule $ text "false" ##> F > obj <- newRule $ char '{' ->> option [] (couple <> many (optional space ->> char ',' ->> couple) ## uncurry (:)) <<- optional space <<- char '}' ## Obj . M.fromList > seq <- newRule $ char '[' ->> option [] (term <> many (optional space ->> char ',' ->> term) ## uncurry (:)) <<- optional space <<- char ']' ## Arr > couple <- newRule $ optional space ->> str <<- optional space <> char ':' ->> term > double <- newRule $ (option "" $ text "-") <++> integer <++> option "" fraq <++> option "" exp ## (\n -> Num $ (read n :: Double)) > integer <- newRule $ many1 (oneOf ['0' .. '9']) > fraq <- newRule $ text "." <++> integer > exp <- newRule $ (oneOf ['e','E'] ##> "E") <++> (option "" $ text "+" // text "-") <++> integer > var <- newRule $ char '?' ->> (option "" $ text "_") <++> (option "" str) ## Var > string <- newRule $ str ## Str > str <- newRule $ (cstring // symbol1 // symbol2) > symbol1 <- newRule $ many1 (symChr) > symChr <- newRule $ oneOf $ ['!'] ++ ['#'..'&'] ++ ['*'..'+'] ++ ['-'..'/'] ++ [';'..'<'] ++ ['>'..'@'] ++ ['^'..'_'] ++ ['|'] ++ ['~'] -- TODO: CHECK [':'] ['='] [','] > symbol2 <- newRule $ onlyIf anyChar isLetter <> many (onlyIf anyChar (\c -> isLetter c || isDigit c)) ## \(h,t) -> h:t > cstring <- newRule $ char '"' ->> (many $ strCh) <<- char '"' > strCh <- newRule $ plainCh // escCh > plainCh <- newRule $ onlyIf anyChar (\c -> not (c=='"' || c=='\\')) -- || isControl c)) > escCh <- newRule $ char '\\' ->> (oneOf ['"','\\','/'] // char 'b' ##> '\b' // char 'f' ##> '\f' // char 'n' ##> '\n' // char 'r' ##> '\r' // char 't' ##> '\t' // unicode) > unicode <- newRule $ char 'u' ->> (hexCh <> hexCh <> hexCh <> hexCh) ## \(((x1,x2),x3),x4) -> uni2char x1 x2 x3 x4 > hexCh <- newRule $ onlyIf anyChar isHexDigit > space <- newRule $ many1 $ char ' ' > return logic > where > una [e] = e > una l = App (una $ init l) (last l) > uni2char x1 x2 x3 x4 = read ("'\\x" ++ [x1,x2,x3,x4] ++ "'")