-- | The CSV (comma-separated value) format is defined by RFC 4180,
-- \"Common Format and MIME Type for Comma-Separated Values (CSV) Files\",
-- <http://www.rfc-editor.org/rfc/rfc4180.txt>
--
-- This lazy parser can report all CSV formatting errors, whilst also
-- returning all the valid data, so the user can choose whether to
-- continue, to show warnings, or to halt on error.
--
-- Valid fields retain information about their original location in the
-- input, so a secondary parser from textual fields to typed values
-- can give intelligent error messages.
--
-- In a valid CSV file, all rows must have the same number of columns.
-- This parser flags a row with the wrong number of columns as an error.
-- (But the error type contains the actual data, so the user can recover
-- it if desired.) Completely blank lines are also treated as errors,
-- and again the user is free either to filter these out or convert them
-- to a row of actual null fields.
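--
-- A minimal usage sketch (illustrative only; @main@ and the file name are
-- hypothetical, the library calls are real): warn about bad rows, then
-- continue with whatever parsed cleanly.
--
-- > import qualified Data.ByteString.Lazy.Char8 as BS
-- > import Text.CSV.Lazy.ByteString
-- >
-- > main :: IO ()
-- > main = do
-- >     result <- fmap parseCSV (BS.readFile "input.csv")
-- >     mapM_ (putStrLn . ppCSVError) (csvErrors result)  -- warn, don't halt
-- >     BS.putStr (ppCSVTable (csvTable result))          -- keep valid rows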
module Text.CSV.Lazy.ByteString
( -- * CSV types
CSVTable
, CSVRow
, CSVField(..)
-- * CSV parsing
, CSVError(..)
, CSVResult
, csvErrors
, csvTable
, parseCSV
, parseDSV
-- * Pretty-printing
, ppCSVError
, ppCSVField
, ppCSVTable
    , ppDSVTable
    -- , ppCSVTableAsTuples
-- * Conversion between standard and simple representations
, fromCSVTable
, toCSVTable
-- * Selection, validation, and algebra of CSV tables
, selectFields
, expectFields
, mkEmptyColumn
, joinCSV
    ) where
import Data.List (groupBy, partition, elemIndex, intercalate)
import Data.Function (on)
import Data.Maybe (fromJust)
import qualified Data.ByteString.Lazy.Char8 as BS
import Data.ByteString.Lazy.Char8 (ByteString)
-- | A CSV table is a sequence of rows. All rows have the same number
-- of fields.
type CSVTable = [CSVRow]
-- | A CSV row is just a sequence of fields.
type CSVRow = [CSVField]
-- | A CSV field's content is stored with its logical row and column number,
-- as well as its textual extent. This information is necessary if you
-- want to generate good error messages in a secondary parsing stage,
-- should you choose to convert the textual fields to typed data values.
data CSVField = CSVField { csvRowNum :: !Int
, csvColNum :: !Int
, csvTextStart :: !(Int,Int)
, csvTextEnd :: !(Int,Int)
, csvFieldContent :: !ByteString
, csvFieldQuoted :: !Bool }
| CSVFieldError { csvRowNum :: !Int
, csvColNum :: !Int
, csvTextStart :: !(Int,Int)
, csvTextEnd :: !(Int,Int)
, csvFieldError :: !String }
deriving (Eq,Show)
-- | A structured error type for CSV formatting mistakes.
data CSVError = IncorrectRow { csvRow :: Int
, csvColsExpected :: Int
, csvColsActual :: Int
, csvFields :: [CSVField] }
| BlankLine { csvRow :: !Int
, csvColsExpected :: !Int
, csvColsActual :: !Int
, csvField :: CSVField }
| FieldError { csvField :: CSVField }
| NoData
deriving (Eq,Show)
-- | The result of parsing a CSV input is a mixed collection of errors
-- and valid rows. This way of representing things is crucial to the
-- ability to parse lazily whilst still catching format errors.
type CSVResult = [ Either [CSVError] [CSVField] ]
-- | Extract just the valid portions of a CSV parse.
csvTable :: CSVResult -> CSVTable
csvTable r = [ row | Right row <- r ]
-- | Extract just the errors from a CSV parse.
csvErrors :: CSVResult -> [CSVError]
csvErrors r = concat [ err | Left err <- r ]
-- | A first-stage parser for CSV (comma-separated values) data.
-- The individual fields remain as text, but errors in CSV formatting
-- are reported. Errors (containing unrecognisable rows/fields) are
-- interspersed with the valid rows/fields.
parseCSV :: ByteString -> CSVResult
parseCSV = parseDSV True ','
-- | Sometimes CSV is not comma-separated, but delimiter-separated
-- values (DSV). The choice of delimiter is arbitrary, but semi-colon
-- is common in locales where comma is used as a decimal point, and tab
-- is also common. The Boolean argument is
-- whether newlines should be accepted within quoted fields. The CSV RFC
-- says newlines can occur in quotes, but other DSV formats might say
-- otherwise. You can often get better error messages if newlines are
-- disallowed.
parseDSV :: Bool -> Char -> ByteString -> CSVResult
parseDSV qn delim = validate
. groupBy ((==)`on`csvRowNum)
. lexCSV qn delim
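-- For instance (a hedged sketch; the sample input is invented, and @BS@ is
-- @Data.ByteString.Lazy.Char8@), parsing semicolon-separated data that
-- uses comma as the decimal point:
--
-- > semi :: CSVResult
-- > semi = parseDSV True ';' (BS.pack "name;price\nwidget;\"1,50\"\n")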
-- Check that every row has the same width as the first row of the table.
validate :: [CSVRow] -> CSVResult
validate []       = [Left [NoData]]
validate xs@(x:_) = map (extractErrs (length x)) xs
extractErrs :: Int -> CSVRow -> Either [CSVError] CSVRow
extractErrs size row
| length row0 == size && null errs0 = Right row0
| length row0 == 1 && empty field0 = Left [blankLine field0]
| otherwise = Left (map convert errs0
++ validateColumns row0)
where
(row0,errs0) = partition isField row
(field0:_) = row0
isField (CSVField{}) = True
isField (CSVFieldError{}) = False
empty f@(CSVField{}) = BS.null (csvFieldContent f)
empty _ = False
convert err = FieldError {csvField = err}
validateColumns r =
if length r == size then []
else [ IncorrectRow{ csvRow = if null r then 0 else csvRowNum (head r)
, csvColsExpected = size
, csvColsActual = length r
, csvFields = r } ]
blankLine f = BlankLine{ csvRow = csvRowNum f
, csvColsExpected = size
, csvColsActual = 1
, csvField = f }
-- Reading CSV data is essentially lexical, and can be implemented with a
-- simple finite state machine. We keep track of logical row number,
-- logical column number (in tabular terms), and textual position (row,col)
-- to enable good error messages.
-- Positional data is retained even after successful lexing, in case a
-- second-stage field parser wants to complain.
--
-- A double-quoted CSV field may contain commas, newlines, and double quotes.
data CSVState = CSVState { tableRow, tableCol :: !Int
, textRow, textCol :: !Int }
deriving Show
incTableRow, incTableCol, incTextRow :: CSVState -> CSVState
incTableRow st = st { tableRow = tableRow st + 1 , tableCol = 1 }
incTableCol st = st { tableCol = tableCol st + 1 }
incTextRow st = st { textRow = textRow st + 1 , textCol = 1 }
incTextCol :: Int -> CSVState -> CSVState
incTextCol n st = st { textCol = textCol st + n }
here :: CSVState -> (Int,Int)
here st = (textRow st, textCol st)
-- The lexer is a small finite-state machine.
lexCSV :: Bool -> Char -> ByteString -> [CSVField]
lexCSV qn delim =
getFields qn delim
(CSVState{tableRow=1,tableCol=1,textRow=1,textCol=1}) (1,1)
getFields :: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields q d state begin bs0
= case BS.uncons bs0 of
Nothing -> []
Just ('"', bs1) -> doStringFieldContent q d (incTextCol 1 state) begin
BS.empty bs1
_ ->
case BS.break interestingChar bs0 of
(fieldBs, bs1) ->
let field = mkField end begin fieldBs False
end = incTextCol (len-1) $ state
state' = incTableCol $ incTextCol 2 end
stateNL = incTableRow . incTextRow $ state
len = fromIntegral $ BS.length fieldBs
in case BS.uncons bs1 of
Just (c,bs2)
| c==d -> field: getFields q d state' (here state') bs2
Just ('\r',bs2) ->
case BS.uncons bs2 of
Just ('\n',bs3)
-> field: getFields q d stateNL (here stateNL) bs3
-- XXX This could be an error instead:
_ -> field: getFields q d stateNL (here stateNL) bs2
Just ('\n',bs2) -> field: getFields q d stateNL (here stateNL) bs2
Just ('"', _) -> field:
mkError state' begin
"unexpected quote, resync at EOL":
getFields q d stateNL (here stateNL)
(BS.dropWhile (/='\n') bs1)
Just _ -> [mkError state' begin "XXX Can't happen"]
Nothing -> field: getFields q d stateNL (here stateNL) bs1
where interestingChar '\r' = True
interestingChar '\n' = True
interestingChar '"' = True
interestingChar c | c==d = True
interestingChar _ = False
doStringFieldContent :: Bool -> Char -> CSVState -> (Int, Int) -> ByteString
-> ByteString -> [CSVField]
doStringFieldContent q d state begin acc bs1
= case BS.break interestingCharInsideString bs1 of
(newBs, bs2) ->
let fieldBs = acc `BS.append` newBs
field = mkField end begin fieldBs True
end = incTextCol (len-1) state
state' = incTableCol $ incTextCol 3 end
stateNL = incTableRow . incTextRow $ state
len = fromIntegral $ BS.length newBs
in case BS.uncons bs2 of
Just ('\r',bs3) ->
case BS.uncons bs3 of
Just ('\n',bs4) | q ->
doStringFieldContent q d (incTextRow end) begin
(fieldBs `BS.append` BS.singleton '\n') bs4
_ -> doStringFieldContent q d end begin
(fieldBs `BS.append` BS.singleton '\r') bs3
Just ('\n',bs3) | q ->
doStringFieldContent q d (incTextRow end) begin
(fieldBs `BS.append` BS.singleton '\n') bs3
Just ('\n',bs3) ->
field:
mkError end begin "Found newline within quoted field":
getFields q d stateNL (here stateNL) bs3
Just ('"', bs3) ->
case BS.uncons bs3 of
Just (c,bs4)
| c==d -> field: getFields q d state' (here state') bs4
Just ('\r',bs4) ->
case BS.uncons bs4 of
Just ('\n',bs5) ->
field: getFields q d stateNL (here stateNL) bs5
-- XXX This could be an error instead:
_ -> field: getFields q d stateNL (here stateNL) bs4
Just ('\n',bs4) -> field: getFields q d stateNL (here stateNL) bs4
Just ('"',bs4) ->
doStringFieldContent q d (incTextCol 3 end) begin
(fieldBs `BS.append` BS.singleton '"') bs4
Just _ -> field:
mkError state' begin "End-quote not followed by comma":
getFields q d state' (here state') bs3
Nothing -> field: getFields q d stateNL (here stateNL) bs3
Just _ -> [mkError state' begin "XXX Can't happen (string field)"]
               Nothing -> [ field
                          , mkError state' begin
                                    "CSV data ends within a quoted string" ]
where interestingCharInsideString '\r' = True
interestingCharInsideString '\n' = True
interestingCharInsideString '"' = True
interestingCharInsideString _ = False
mkField :: CSVState -> (Int, Int) -> ByteString -> Bool -> CSVField
mkField st begin bs q = CSVField { csvRowNum = tableRow st
, csvColNum = tableCol st
, csvTextStart = begin
, csvTextEnd = (textRow st,textCol st)
, csvFieldContent = bs
, csvFieldQuoted = q }
mkError :: CSVState -> (Int, Int) -> String -> CSVField
mkError st begin e = CSVFieldError { csvRowNum = tableRow st
, csvColNum = tableCol st
, csvTextStart = begin
, csvTextEnd = (textRow st,textCol st)
, csvFieldError = e }
-- | Pretty-print a structured CSV formatting error.
ppCSVError :: CSVError -> String
ppCSVError (err@IncorrectRow{}) =
"\nRow "++show (csvRow err)++" has wrong number of fields."++
"\n Expected "++show (csvColsExpected err)++" but got "++
show (csvColsActual err)++"."++
"\n The fields are:"++
indent 8 (concatMap ppCSVField (csvFields err))
ppCSVError (err@BlankLine{}) =
"\nRow "++show (csvRow err)++" is blank."++
"\n Expected "++show (csvColsExpected err)++" fields."
ppCSVError (err@FieldError{}) = ppCSVField (csvField err)
ppCSVError (NoData{}) =
"\nNo usable data (after accounting for any other errors)."
-- | Pretty-printing for CSV fields, shows positional information in addition
-- to the textual content.
ppCSVField :: CSVField -> String
ppCSVField (f@CSVField{}) =
"\n"++BS.unpack (quoted (csvFieldQuoted f) (csvFieldContent f))++
"\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
" (textually from "++show (csvTextStart f)++" to "++
show (csvTextEnd f)++")"
ppCSVField (f@CSVFieldError{}) =
    "\n"++csvFieldError f++
    "\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
    " (textually from "++show (csvTextStart f)++" to "++
    show (csvTextEnd f)++")"
-- | Output a table back to a lazily-constructed bytestring. There are lots of
-- possible design decisions one could take, e.g. to re-arrange columns
-- back into something resembling their original order, but here we just
-- take the given table without looking at Row and Field numbers etc.
ppCSVTable :: CSVTable -> ByteString
ppCSVTable = BS.unlines . map (BS.intercalate (BS.pack ",") . map ppField)
where ppField f = quoted (csvFieldQuoted f) (csvFieldContent f)
-- | Output a table back to a lazily-constructed bytestring, using the given
-- delimiter char.
ppDSVTable :: Char -> CSVTable -> ByteString
ppDSVTable d = BS.unlines . map (BS.intercalate (BS.pack [d]) . map ppField)
where ppField f = quoted (csvFieldQuoted f) (csvFieldContent f)
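-- A round-trip sketch, assuming a tab-separated file @data.tsv@ (the file
-- name is hypothetical; @BS@ is @Data.ByteString.Lazy.Char8@): re-render
-- only the valid rows.
--
-- > retab :: IO ()
-- > retab = do
-- >     result <- fmap (parseDSV False '\t') (BS.readFile "data.tsv")
-- >     BS.putStr (ppDSVTable '\t' (csvTable result))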
{-
-- | Output a table back to a string, but using Haskell list-of-tuple notation
-- rather than CSV.
ppCSVTableAsTuples :: CSVTable -> String
ppCSVTableAsTuples = indent 4 . unlines . map ( (", ("++) . (++")")
. intercalate ", " . map ppField )
where ppField f = quoted (csvFieldQuoted f) (BS.unpack (csvFieldContent f))
-}
-- Some pp helpers - indent and quoted - should live elsewhere, in a
-- pretty-printing package.
indent :: Int -> String -> String
indent n = unlines . map (replicate n ' ' ++) . lines
quoted :: Bool -> ByteString -> ByteString
quoted False s = s
quoted True  s = BS.concat [BS.pack "\"", escape s, BS.pack "\""]
    -- Double up embedded quote characters, per RFC 4180, and stop once
    -- no quote remains.
    where escape t = let (good,next) = BS.span (/='"') t
                     in if BS.null next then good
                        else BS.concat [ good, BS.pack "\"\""
                                       , escape (BS.tail next) ]
-- | Convert a CSV table to a simpler representation, by dropping all
-- the original location information.
fromCSVTable :: CSVTable -> [[ByteString]]
fromCSVTable = map (map csvFieldContent)
-- | Convert a simple list of lists into a CSVTable by the addition of
-- logical locations. (Textual locations are not so useful.)
-- Rows of varying lengths generate errors. Fields that need
-- quotation marks are automatically marked as such.
toCSVTable :: [[ByteString]] -> ([CSVError], CSVTable)
toCSVTable [] = ([NoData], [])
toCSVTable rows@(r:_) = (\ (a,b)-> (concat a, b)) $
unzip (zipWith walk [1..] rows)
where
n = length r
walk :: Int -> [ByteString] -> ([CSVError], CSVRow)
walk rnum [] = ( [blank rnum]
, map (\c-> newField rnum c (BS.empty)) [1..n])
walk rnum cs = ( if length cs /= n then [bad rnum cs] else []
, zipWith (newField rnum) [1..n] cs )
blank rnum = BlankLine{ csvRow = rnum
, csvColsExpected = n
, csvColsActual = 0
, csvField = newField rnum 0 BS.empty
}
bad r cs = IncorrectRow{ csvRow = r
, csvColsExpected = n
, csvColsActual = length cs
, csvFields = zipWith (newField r) [1..] cs
}
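-- A small sketch (the data is invented; @BS@ is
-- @Data.ByteString.Lazy.Char8@): build a table from plain bytestrings and
-- render it, keeping any structural errors separate.
--
-- > rendered :: ([CSVError], ByteString)
-- > rendered = let (errs, tbl) = toCSVTable (map (map BS.pack)
-- >                                [ ["name","age"], ["Alice","42"] ])
-- >             in (errs, ppCSVTable tbl)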
-- | Select and/or re-arrange columns from a CSV table, based on names in the
-- header row of the table. The original header row is re-arranged too.
-- The result is either a list of column names that were not present, or
-- the (possibly re-arranged) sub-table.
selectFields :: [String] -> CSVTable -> Either [String] CSVTable
selectFields names table
| null table = Left names
| not (null missing) = Left missing
| otherwise = Right (map select table)
where
header = map (BS.unpack . csvFieldContent) (head table)
missing = filter (`notElem` header) names
reordering = map (fromJust . (\n-> elemIndex n header)) names
select fields = map (fields!!) reordering
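-- For example (a hypothetical helper), keeping two named columns in a
-- fixed order, whatever their position in the input:
--
-- > nameAndAge :: CSVTable -> Either [String] CSVTable
-- > nameAndAge = selectFields ["name", "age"]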
-- | Validate that the columns of a table have exactly the names and
-- ordering given in the argument.
expectFields :: [String] -> CSVTable -> Either [String] CSVTable
expectFields names table
| null table = Left ["CSV table is empty"]
| not (null missing) = Left (map ("CSV table is missing field: "++)
missing)
| header /= names = Left ["CSV columns are in the wrong order"
,"Expected: "++intercalate ", " names
,"Found: "++intercalate ", " header]
| otherwise = Right table
where
header = map (BS.unpack . csvFieldContent) (head table)
missing = filter (`notElem` header) names
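-- A sketch of schema checking (the column names are invented); unlike
-- 'selectFields', the header must match exactly, order included:
--
-- > checked :: CSVTable -> Either [String] CSVTable
-- > checked = expectFields ["id", "name", "age"]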
-- | A join operator: appends the columns of two tables, row by row.
-- Precondition: the tables have the same number of rows; if not, the
-- result is silently truncated to the shorter of the two.
joinCSV :: CSVTable -> CSVTable -> CSVTable
joinCSV = zipWith (++)
-- | A generator for a new CSV column, of arbitrary length: the column
-- produced is conceptually infinite. The result can be joined to an
-- existing table with 'joinCSV', which truncates it to the table's height.
mkEmptyColumn :: String -> CSVTable
mkEmptyColumn header = [headField] : map ((:[]).emptyField) [2..]
where
headField = (emptyField 1) { csvFieldContent = BS.pack header
, csvFieldQuoted = True }
emptyField n = CSVField { csvRowNum = n
, csvColNum = 0
, csvTextStart = (0,0)
, csvTextEnd = (0,0)
, csvFieldContent = BS.empty
, csvFieldQuoted = False
}
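-- A sketch combining the two (the column name is invented): extend a table
-- with a fresh, empty \"notes\" column. Because the generated column is
-- infinite, 'joinCSV' truncates it to the height of the table.
--
-- > withNotes :: CSVTable -> CSVTable
-- > withNotes t = joinCSV t (mkEmptyColumn "notes")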
-- | Generate a fresh field with the given textual content.
-- The quoting flag is set automatically based on the text.
-- Textual extents are not particularly useful, since there was no original
-- input to refer to.
newField :: Int -> Int -> ByteString -> CSVField
newField n c text = CSVField { csvRowNum = n
, csvColNum = c
, csvTextStart = (0,0)
, csvTextEnd = ( fromIntegral
. BS.length
. BS.filter (=='\n')
$ text
, fromIntegral
. BS.length
. BS.takeWhile (/='\n')
. BS.reverse $ text )
, csvFieldContent = text
                             , csvFieldQuoted  = BS.any (`elem` "\",\n\r") text
}