-- | The CSV (comma-separated value) format is defined by RFC 4180,
-- \"Common Format and MIME Type for Comma-Separated Values (CSV) Files\",
-- <http://www.rfc-editor.org/rfc/rfc4180.txt>
--
-- This lazy parser can report all CSV formatting errors, whilst also
-- returning all the valid data, so the user can choose whether to
-- continue, to show warnings, or to halt on error.
--
-- Valid fields retain information about their original location in the
-- input, so a secondary parser from textual fields to typed values
-- can give intelligent error messages.
--
-- In a valid CSV file, all rows must have the same number of columns.
-- This parser will flag a row with the wrong number of columns as an error.
-- (But the error type contains the actual data, so the user can recover
-- it if desired.) Completely blank lines are also treated as errors,
-- and again the user is free either to filter these out or convert them
-- to a row of actual null fields.
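--
-- A typical use (an illustrative sketch; \"stock.csv\" is a hypothetical
-- input file):
--
-- > main = do
-- >   text <- readFile "stock.csv"
-- >   let result = parseCSV text
-- >   mapM_ (putStrLn . ppCSVError) (csvErrors result)
-- >   putStr (ppCSVTable (csvTable result))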
module Text.CSV.Lazy.String
( -- * CSV types
CSVTable
, CSVRow
, CSVField(..)
-- * CSV parsing
, CSVError(..)
, CSVResult
, csvErrors
, csvTable
, parseCSV
, parseDSV
-- * Pretty-printing
, ppCSVError
, ppCSVField
, ppCSVTable
, ppDSVTable
-- * Conversion between standard and simple representations
, fromCSVTable
, toCSVTable
-- * Selection, validation, and algebra of CSV tables
, selectFields
, expectFields
, mkEmptyColumn
, joinCSV
) where
import Data.List (groupBy, partition, elemIndex, intercalate, takeWhile)
import Data.Function (on)
import Data.Maybe (fromJust)
-- | A CSV table is a sequence of rows. All rows have the same number
-- of fields.
type CSVTable = [CSVRow]
-- | A CSV row is just a sequence of fields.
type CSVRow = [CSVField]
-- | A CSV field's content is stored with its logical row and column number,
-- as well as its textual extent. This information is necessary if you
-- want to generate good error messages in a secondary parsing stage,
-- should you choose to convert the textual fields to typed data values.
data CSVField = CSVField { csvRowNum :: !Int
, csvColNum :: !Int
, csvTextStart :: !(Int,Int)
, csvTextEnd :: !(Int,Int)
, csvFieldContent :: !String
, csvFieldQuoted :: !Bool }
| CSVFieldError { csvRowNum :: !Int
, csvColNum :: !Int
, csvTextStart :: !(Int,Int)
, csvTextEnd :: !(Int,Int)
, csvFieldError :: !String }
deriving (Eq,Show)
-- | A structured error type for CSV formatting mistakes.
data CSVError = IncorrectRow { csvRow :: !Int
, csvColsExpected :: !Int
, csvColsActual :: !Int
, csvFields :: [CSVField] }
| BlankLine { csvRow :: !Int
, csvColsExpected :: !Int
, csvColsActual :: !Int
, csvField :: CSVField }
| FieldError { csvField :: CSVField }
| NoData
deriving (Eq,Show)
-- | The result of parsing a CSV input is a mixed collection of errors
-- and valid rows. This way of representing things is crucial to the
-- ability to parse lazily whilst still catching format errors.
type CSVResult = [Either [CSVError] CSVRow]
-- | Extract just the valid portions of a CSV parse.
csvTable :: CSVResult -> CSVTable
csvTable r = [ v | Right v <- r ]
-- | Extract just the errors from a CSV parse.
csvErrors :: CSVResult -> [CSVError]
csvErrors r = concat [ v | Left v <- r ]
-- | A first-stage parser for CSV (comma-separated values) data.
-- The individual fields remain as text, but errors in CSV formatting
-- are reported. Errors (containing unrecognisable rows/fields) are
-- interspersed with the valid rows/fields.
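--
-- For example (an illustrative GHCi-style session; the short row is
-- deliberately malformed):
--
-- > let result = parseCSV "name,age\nAlice,30\nBob\n"
-- > fromCSVTable (csvTable result)   -- [["name","age"],["Alice","30"]]
-- > length (csvErrors result)        -- 1, for the short row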
parseCSV :: String -> CSVResult
parseCSV = parseDSV True ','
-- | Sometimes CSV is not comma-separated, but delimiter-separated
-- values (DSV). The choice of delimiter is arbitrary, but semi-colon
-- is common in locales where comma is used as a decimal point, and tab
-- is also common. The Boolean argument determines whether newlines are
-- accepted within quoted fields. The CSV RFC
-- says newlines can occur in quotes, but other DSV formats might say
-- otherwise. You can often get better error messages if newlines are
-- disallowed.
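--
-- For example (illustrative), parsing semicolon-separated data with
-- newlines in quotes disallowed:
--
-- > fromCSVTable (csvTable (parseDSV False ';' "x;y\n1;2\n"))
-- >   -- [["x","y"],["1","2"]]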
parseDSV :: Bool -> Char -> String -> CSVResult
parseDSV qn delim = validate
. groupBy ((==)`on`csvRowNum)
. lexCSV qn delim
validate :: [CSVRow] -> CSVResult
validate [] = [Left [NoData]]
validate xs@(x:_) = map (extractErrs (length x)) xs
extractErrs :: Int -> CSVRow -> Either [CSVError] CSVRow
extractErrs size row
| length row0 == size && null errs0 = Right row0
| length row0 == 1 && empty field0 = Left [blankLine field0]
| otherwise = Left (map convert errs0
++ validateColumns row0)
where
(row0,errs0) = partition isField row
(field0:_) = row0
isField (CSVField{}) = True
isField (CSVFieldError{}) = False
empty f@(CSVField{}) = null (csvFieldContent f)
empty _ = False
convert err = FieldError {csvField = err}
validateColumns r =
if length r == size then []
else [ IncorrectRow{ csvRow = if null r then 0 else csvRowNum (head r)
, csvColsExpected = size
, csvColsActual = length r
, csvFields = r } ]
blankLine f = BlankLine{ csvRow = csvRowNum f
, csvColsExpected = size
, csvColsActual = 1
, csvField = f }
-- Reading CSV data is essentially lexical, and can be implemented with a
-- simple finite state machine. We keep track of logical row number,
-- logical column number (in tabular terms), and textual position (row,col)
-- to enable good error messages.
-- Positional data is retained even after successful lexing, in case a
-- second-stage field parser wants to complain.
--
-- A double-quoted CSV field may contain commas, newlines, and double quotes.
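--
-- For example (illustrative), lexing comma-separated input with quoted
-- newlines allowed:
--
--   lexCSV True ',' "a,b\nc,d\n"
--
-- yields four CSVField tokens whose (csvRowNum,csvColNum) positions are
-- (1,1), (1,2), (2,1), (2,2), with contents "a", "b", "c", "d".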
data CSVState = CSVState { tableRow, tableCol :: !Int
, textRow, textCol :: !Int }
incTableRow, incTableCol, incTextRow, incTextCol :: CSVState -> CSVState
incTableRow st = st { tableRow = tableRow st + 1 }
incTableCol st = st { tableCol = tableCol st + 1 }
incTextRow st = st { textRow = textRow st + 1 }
incTextCol st = st { textCol = textCol st + 1 }
-- The lexer is a small finite state machine.
lexCSV :: Bool -> Char -> [Char] -> [CSVField]
lexCSV quotedNewline delim =
simple (CSVState{tableRow=1,tableCol=1,textRow=1,textCol=1}) (1,1) []
where
-- 'simple' recognises an unquoted field, with the delimiter char as separator
simple :: CSVState -> (Int,Int) -> String -> String -> [CSVField]
simple _ _ [] [] = []
simple s begin acc [] = mkField s begin acc False : []
simple s begin acc (c:cs)
| not (interesting c) = simple (incTextCol $! s) begin (c:acc) cs
simple s begin acc (c:'"':cs)
| c==delim = mkField s begin acc False :
string s' (textRow s',textCol s') [] cs
where s' = incTextCol . incTextCol .
incTableCol $! s
simple s begin acc (c:cs)
| c==delim = mkField s begin acc False :
simple s' (textRow s',textCol s') [] cs
where s' = incTableCol . incTextCol $! s
simple s begin acc ('\r':'\n':cs)
= mkField s begin acc False :
simple s' (textRow s',1) [] cs
where s' = incTableRow . incTextRow $!
s {tableCol=1, textCol=1}
simple s begin acc ('\n' :cs) = mkField s begin acc False :
simple s' (textRow s',1) [] cs
where s' = incTableRow . incTextRow $!
s {tableCol=1, textCol=1}
simple s begin [] ('"' :cs) = string (incTextCol $! s) begin [] cs
simple s begin acc ('"' :cs) = mkError s begin
"Start-quote not next to comma":
string (incTextCol $! s) begin acc cs
-- 'string' recognises a double-quoted field containing commas and newlines
string :: CSVState -> (Int,Int) -> String -> String -> [CSVField]
string s begin [] [] = mkError s begin "Data ends at start-quote":
[]
string s begin acc [] = mkError s begin "Data ends in quoted field":
[]
string s begin acc (c:cs)
| not (interestingInString c) = string (incTextCol $! s) begin (c:acc) cs
string s begin acc ('"':'"':cs) = string s' begin ('"':acc) cs
where s' = incTextCol . incTextCol $! s
string s begin acc ('"':c:'"':cs)
| c==delim = mkField s begin acc True :
string s' (textRow s',textCol s') [] cs
where s' = incTextCol . incTextCol .
incTextCol . incTableCol $! s
string s begin acc ('"':c:cs)
| c==delim = mkField s begin acc True :
simple s' (textRow s',textCol s') [] cs
where s' = incTextCol . incTextCol .
incTableCol $! s
string s begin acc ('"':'\n':cs)= mkField s begin acc True :
simple s' (textRow s',1) [] cs
where s' = incTableRow . incTextRow $!
s {tableCol=1, textCol=1}
string s begin acc ('"':'\r':'\n':cs)
= mkField s begin acc True :
simple s' (textRow s',1) [] cs
where s' = incTableRow . incTextRow $!
s {tableCol=1, textCol=1}
string s begin acc ('"':cs) = mkError s begin
"End-quote not followed by comma":
simple (incTextCol $! s) begin acc cs
string s begin acc ('\r':'\n':cs)
| quotedNewline = string s' begin ('\n':acc) cs
| otherwise = mkError s begin
"Found newline within quoted field":
simple s'' (textRow s'',textCol s'') [] cs
where s' = incTextRow $! s {textCol=1}
s'' = incTableRow . incTextRow $!
s {textCol=1, tableCol=1}
string s begin acc ('\n' :cs)
| quotedNewline = string s' begin ('\n':acc) cs
| otherwise = mkError s begin
"Found newline within quoted field":
simple s'' (textRow s'',textCol s'') [] cs
where s' = incTextRow $! s {textCol=1}
s'' = incTableRow . incTextRow $!
s {textCol=1, tableCol=1}
interesting :: Char -> Bool
interesting '\n' = True
interesting '\r' = True
interesting '"' = True
interesting c = c==delim
interestingInString :: Char -> Bool
interestingInString '\n' = True
interestingInString '\r' = True
interestingInString '"' = True
interestingInString _ = False
-- generate the lexical tokens representing either a field or an error
mkField st begin f q = CSVField { csvRowNum = tableRow st
, csvColNum = tableCol st
, csvTextStart = begin
, csvTextEnd = (textRow st,textCol st)
, csvFieldContent = reverse f
, csvFieldQuoted = q }
mkError st begin e = CSVFieldError { csvRowNum = tableRow st
, csvColNum = tableCol st
, csvTextStart = begin
, csvTextEnd = (textRow st,textCol st)
, csvFieldError = e }
-- | Some pretty-printing for structured CSV errors.
ppCSVError :: CSVError -> String
ppCSVError (err@IncorrectRow{}) =
"\nRow "++show (csvRow err)++" has wrong number of fields."++
"\n Expected "++show (csvColsExpected err)++" but got "++
show (csvColsActual err)++"."++
"\n The fields are:"++
indent 8 (concatMap ppCSVField (csvFields err))
ppCSVError (err@BlankLine{}) =
"\nRow "++show (csvRow err)++" is blank."++
"\n Expected "++show (csvColsExpected err)++" fields."
ppCSVError (err@FieldError{}) = ppCSVField (csvField err)
ppCSVError (err@NoData{}) =
"\nNo usable data (after accounting for any other errors)."
-- | Pretty-printing for CSV fields, shows positional information in addition
-- to the textual content.
ppCSVField :: CSVField -> String
ppCSVField (f@CSVField{}) =
"\n"++quoted (csvFieldQuoted f) (csvFieldContent f)++
"\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
" (textually from "++show (csvTextStart f)++" to "++
show (csvTextEnd f)++")"
ppCSVField (f@CSVFieldError{}) =
"\n"++csvFieldError f++
"\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
" (textually from "++show (csvTextStart f)++" to "++
show (csvTextEnd f)++")"
-- | Turn a full CSV table back into text, as much like the original
-- input as possible, e.g. preserving quoted/unquoted format of fields.
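--
-- For example (illustrative):
--
-- > putStr (ppCSVTable (csvTable (parseCSV "a,\"b\"\nc,d\n")))
--
-- prints the two rows back out, with the quotes around @b@ preserved.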
ppCSVTable :: CSVTable -> String
ppCSVTable = unlines . map (intercalate "," . map ppField)
where ppField f = quoted (csvFieldQuoted f) (csvFieldContent f)
-- | Turn a full CSV table back into text, using the given delimiter
-- character. Quoted/unquoted formatting of the original is preserved.
ppDSVTable :: Char -> CSVTable -> String
ppDSVTable delim = unlines . map (intercalate [delim] . map ppField)
where ppField f = quoted (csvFieldQuoted f) (csvFieldContent f)
-- Some pp helpers - indent and quoted - should live elsewhere, in a
-- pretty-printing package.
indent :: Int -> String -> String
indent n = unlines . map (replicate n ' ' ++) . lines
quoted :: Bool -> String -> String
quoted False s = s
quoted True s = '"': escape s ++"\""
where escape ('"':cs) = '"':'"': escape cs
escape (c:cs) = c: escape cs
escape [] = []
-- | Convert a CSV table to a simpler representation, by dropping all
-- the original location information.
fromCSVTable :: CSVTable -> [[String]]
fromCSVTable = map (map csvFieldContent)
-- | Convert a simple list of lists into a CSVTable by the addition of
-- logical locations. (Textual locations are not so useful.)
-- Rows of varying lengths generate errors. Fields that need
-- quotation marks are automatically marked as such.
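--
-- For example (illustrative):
--
-- > let (errs, table) = toCSVTable [["a","b"],["c","d,e"]]
-- > null errs                  -- True
-- > putStr (ppCSVTable table)  -- the field "d,e" comes out quoted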
toCSVTable :: [[String]] -> ([CSVError], CSVTable)
toCSVTable [] = ([NoData], [])
toCSVTable rows@(r:_) = (\ (a,b)-> (concat a, b)) $
unzip (zipWith walk [1..] rows)
where
n = length r
walk :: Int -> [String] -> ([CSVError], CSVRow)
walk rnum [] = ( [blank rnum]
, map (\c-> newField rnum c "") [1..n])
walk rnum cs = ( if length cs /= n then [bad rnum cs] else []
, zipWith (newField rnum) [1..n] cs )
blank rnum = BlankLine{ csvRow = rnum
, csvColsExpected = n
, csvColsActual = 0
, csvField = newField rnum 0 ""
}
bad r cs = IncorrectRow{ csvRow = r
, csvColsExpected = n
, csvColsActual = length cs
, csvFields = zipWith (newField r) [1..] cs
}
-- | Select and/or re-arrange columns from a CSV table, based on names in the
-- header row of the table. The original header row is re-arranged too.
-- The result is either a list of column names that were not present, or
-- the (possibly re-arranged) sub-table.
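--
-- For example (illustrative):
--
-- > let table = csvTable (parseCSV "name,age,city\nAlice,30,Leeds\n")
-- > fmap fromCSVTable (selectFields ["city","name"] table)
-- >   -- Right [["city","name"],["Leeds","Alice"]]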
selectFields :: [String] -> CSVTable -> Either [String] CSVTable
selectFields names table
| null table = Left names
| not (null missing) = Left missing
| otherwise = Right (map select table)
where
header = map csvFieldContent (head table)
missing = filter (`notElem` header) names
reordering = map (fromJust . (\n-> elemIndex n header)) names
select fields = map (fields!!) reordering
-- | Validate that the header row of a table contains exactly the column
-- names given in the argument, in the same order.
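--
-- For example (illustrative):
--
-- > let table = csvTable (parseCSV "name,age\nAlice,30\n")
-- > fmap fromCSVTable (expectFields ["name","age"] table)
-- >   -- Right [["name","age"],["Alice","30"]]
--
-- Any mismatch in the names or their order gives a 'Left' with a
-- description of the problem.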
expectFields :: [String] -> CSVTable -> Either [String] CSVTable
expectFields names table
| null table = Left ["CSV table is empty"]
| not (null missing) = Left (map ("CSV table is missing field: "++)
missing)
| header /= names = Left ["CSV columns are in the wrong order"
,"Expected: "++intercalate ", " names
,"Found: "++intercalate ", " header]
| otherwise = Right table
where
header = map csvFieldContent (head table)
missing = filter (`notElem` header) names
-- | A join operator that adds the columns of two tables together.
-- Precondition: the tables have the same number of rows.
joinCSV :: CSVTable -> CSVTable -> CSVTable
joinCSV = zipWith (++)
-- | A generator for a new CSV column. The column is lazily infinite,
-- so joining it to an existing table with 'joinCSV' truncates it to
-- the length of that table.
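--
-- For example (illustrative), adding an empty \"notes\" column to a
-- small table with 'joinCSV':
--
-- > let table = csvTable (parseCSV "name,age\nAlice,30\n")
-- > fromCSVTable (joinCSV table (mkEmptyColumn "notes"))
-- >   -- [["name","age","notes"],["Alice","30",""]]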
mkEmptyColumn :: String -> CSVTable
mkEmptyColumn header = [newField 1 0 header] :
map (\n-> [newField n 0 ""]) [2..]
-- | Generate a fresh field with the given textual content.
-- The quoting flag is set automatically based on the text.
-- Textual extents are not particularly useful, since there was no original
-- input to refer to.
newField :: Int -> Int -> String -> CSVField
newField n c text = CSVField { csvRowNum = n
, csvColNum = c
, csvTextStart = (0,0)
, csvTextEnd = (length (filter (=='\n') text)
,length . takeWhile (/='\n')
. reverse $ text )
, csvFieldContent = text
, csvFieldQuoted = any (`elem`"\",\n\r") text
}