-- -- Lexical syntax for Haskell 98. -- -- (c) Simon Marlow 2003, with the caveat that much of this is -- translated directly from the syntax in the Haskell 98 report. -- { module HaskellScanner ( Token(..), scan, alexStartPos, lit_comment ) where import Data.Char ( ord ) import Data.Bits -- GHC import StringBuffer } $whitechar = [\ \t\n\r\f\v] $special = [\(\)\,\;\[\]\`\{\}] $ascdigit = 0-9 $unidigit = [] -- TODO $digit = [$ascdigit $unidigit] $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~] $unisymbol = [] -- TODO $symbol = [$ascsymbol $unisymbol] # [$special \_\:\"\'] $large = [A-Z \xc0-\xd6 \xd8-\xde] $small = [a-z \xdf-\xf6 \xf8-\xff \_] $alpha = [$small $large] $graphic = [$small $large $symbol $digit $special \:\"\'] $octit = 0-7 $hexit = [0-9 A-F a-f] $idchar = [$alpha $digit \'] $symchar = [$symbol \:] $nl = [\n\r] @reservedid = as|case|class|data|default|deriving|do|else|hiding|if| import|in|infix|infixl|infixr|instance|let|module|newtype| of|qualified|then|type|where|forall|mdo|foreign|export|dynamic| safe|threadsafe|unsafe|stdcall|ccall|dotnet @reservedop = ".." | ":" | "::" | "=" | \\ | "|" | "<-" | "->" | "@" | "~" | "=>" @varid = $small $idchar* @conid = $large $idchar* @varsym = $symbol $symchar* @consym = \: $symchar* @decimal = $digit+ @octal = $octit+ @hexadecimal = $hexit+ @exponent = [eE] [\-\+] @decimal $cntrl = [$large \@\[\\\]\^\_] @ascii = \^ $cntrl | NUL | SOH | STX | ETX | EOT | ENQ | ACK | BEL | BS | HT | LF | VT | FF | CR | SO | SI | DLE | DC1 | DC2 | DC3 | DC4 | NAK | SYN | ETB | CAN | EM | SUB | ESC | FS | GS | RS | US | SP | DEL $charesc = [abfnrtv\\\"\'\&] @escape = \\ ($charesc | @ascii | @decimal | o @octal | x @hexadecimal) @gap = \\ $whitechar+ \\ @string = $graphic # [\"\\] | " " | @escape | @gap haskell :- ^"\begin{code}" $white* \n { mkL (LChangeState LLitComment lit_code) } ^> { mkL (LChangeState LLitComment lit_codeline) } ^(\n | [^>].* \n) { mkL LLitComment } ^"\end{code}" $white* \n { mkL (LChangeState LLitComment lit_comment) } ($whitechar # \n)+ { mkL LWhite } \n { mkL (LChangeState LWhite lit_comment) } <0,lit_code> $white+ { mkL LWhite } <0,lit_code,lit_codeline> { "--"\-*[^$symbol].* { mkL LLineComment } "--"\-* / { atEOL } { mkL LLineComment } "{-" { mkL LNestedComment } $special { mkL LSpecial } @reservedid { mkL LReservedId } @varid { mkL LVarId } @conid { mkL LConId } @reservedop { mkL LReservedOp } @varsym { mkL LVarSym } @consym { mkL LConSym } @decimal | 0[oO] @octal | 0[xX] @hexadecimal { mkL LInteger } @decimal \. @decimal @exponent? | @decimal @exponent { mkL LFloat } \' ($graphic # [\'\\] | " " | @escape) \' { mkL LChar } \" @string* \" { mkL LString } } { data Token = LInteger | LFloat | LChar | LString | LSpecial | LReservedId | LReservedOp | LVarId | LConId | LVarSym | LConSym | LEOF | LError -- this character is not part of a valid lexeme | LWhite | LLineComment | LLitComment | LNestedComment | LChangeState Token {-# UNPACK #-} !Int deriving Eq mkL = id --mkL :: LexemeClass -> AlexInput -> Int -> Alex Lexeme --mkL c (p,_,str) len = return (L p c (take len str)) type ScannerState = Int -- for now -- bits 0-7: startcode -- (0 == plain .hs, -- 1 == literate, in \begin{code} -- 2 == literate, in code line (bird-track) -- 3 == literate, not in code) -- bits 8-31: comment nesting stateMask = 0xff stateShift = 8 mkState startcode nesting = startcode .|. (nesting `shiftL` stateShift) scan :: AlexInput -> ScannerState -> (AlexInput, Int, Token, ScannerState) scan input state = let startcode = state .&. stateMask nesting = state `shiftR` stateShift in if nesting > 0 && startcode /= lit_comment then nested_comment input 0 nesting startcode else case alexScan input startcode of AlexToken inp len LNestedComment -> nested_comment inp len 1 startcode AlexToken inp len (LChangeState tok s) -> (inp, len, tok, mkState s nesting) AlexToken inp len tok -> (inp, len, tok, mkState startcode nesting) AlexEOF -> (input, 0, LEOF, mkState startcode nesting) AlexError inp -> (inp, 0, LError, mkState startcode nesting) nested_comment :: AlexInput -> Int -> Int -> Int -> (AlexInput, Int, Token, ScannerState) nested_comment inp0 len 0 startcode = (inp0, len, LNestedComment, mkState startcode 0) nested_comment inp0 len nesting startcode | Just ('{',inp1) <- char, Just ('-',inp2) <- alexGetChar inp1 = nested_comment inp2 (len+2) (nesting+1) startcode | Just ('-',inp1) <- char, Just ('}',inp2) <- alexGetChar inp1 = nested_comment inp2 (len+2) (nesting-1) startcode | Just ('\n',inp1) <- char, startcode == lit_codeline = (inp1, len+1, LNestedComment, mkState lit_comment nesting) -- in lit_codeline, drop back to lit_comment if we see a newline | Just (c,inp1) <- char = nested_comment inp1 (len+1) nesting startcode | Nothing <- char = if len == 0 then (inp0, len, LEOF, mkState startcode nesting) else (inp0, len, LNestedComment, mkState startcode nesting) where char = alexGetChar inp0 atEOL _ _ _ (_,strbuf) = atEnd strbuf || (currentChar strbuf == '\n' && atEnd (stepOn strbuf)) -- ---------------------------------------------------------------------------- -- boilerplate stuff type AlexInput = (AlexPosn, -- current position, StringBuffer) -- current input string alexInputPrevChar :: AlexInput -> Char alexInputPrevChar (p,strbuf) = prevChar strbuf '\n' alexGetChar :: AlexInput -> Maybe (Char,AlexInput) alexGetChar (p,strbuf) | atEnd strbuf = Nothing | otherwise = p' `seq` Just (c, (p', stepOn strbuf)) where c = currentChar strbuf p' = alexMove p c data AlexPosn = AlexPn !Int !Int !Int deriving (Eq,Show) alexStartPos :: AlexPosn alexStartPos = AlexPn 0 1 1 alexMove :: AlexPosn -> Char -> AlexPosn alexMove (AlexPn a l c) '\t' = AlexPn (a+1) l (((c+7) `div` 8)*8+1) alexMove (AlexPn a l c) '\n' = AlexPn (a+1) (l+1) 1 alexMove (AlexPn a l c) _ = AlexPn (a+1) l (c+1) }