1 {-# LANGUAGE OverloadedStrings #-} 2 module Data.Csv.Contrib 3 ( getHeaders 4 , extractColumnsInitialRows 5 , extractColumnsDistinctSample 6 , extractColumnDefaulting 7 , extractColumn 8 , decodeCsvByNameWith 9 , parseCsvWithHeader 10 , removeBomPrefixText 11 -- for testing only 12 , removeBomPrefix 13 , repairDuplicateLineEndings 14 , repairCarriageReturnOnly 15 ) where 16 17 import qualified Data.Attoparsec.ByteString as ATTO 18 import qualified Data.ByteString as BS 19 import qualified Data.ByteString.Char8 as BSC 20 import qualified Data.ByteString.Lazy as BSL 21 import qualified Data.ByteString.Search as Search 22 import qualified Data.Csv as Csv 23 import qualified Data.Csv.Parser as Csv 24 import qualified Data.HashMap.Strict as HMP 25 import qualified Data.List as L 26 import qualified Data.Maybe as MB 27 import qualified Data.Text.Lazy as TL 28 import qualified Data.Vector as V 29 import Data.Vector (Vector) 30 31 getHeaders :: Csv.Header -> [BS.ByteString] 32 getHeaders = V.toList 33 34 extractColumnsInitialRows :: Int -> Csv.Header -> Vector Csv.NamedRecord -> [(BS.ByteString, [BS.ByteString])] 35 extractColumnsInitialRows maxRows hdrs records = 36 zip 37 hdrs' 38 (fmap (\hdr -> extractColumnDefaulting hdr truncatedRecords) hdrs') 39 where 40 truncatedRecords = V.take maxRows records 41 hdrs' :: [BS.ByteString] 42 hdrs' = getHeaders hdrs 43 44 extractColumnsDistinctSample :: Int -> Csv.Header -> Vector Csv.NamedRecord -> [(BS.ByteString, [BS.ByteString])] 45 extractColumnsDistinctSample maxSamples hdrs records = 46 zip hdrs' 47 ( fmap 48 ( getSample 49 . (\hdr -> extractColumnDefaulting hdr records)) 50 hdrs' ) 51 where 52 getSample :: [BS.ByteString] -> [BS.ByteString] 53 getSample vals = 54 (take maxSamples . L.nub) vals 55 hdrs' :: [BS.ByteString] 56 hdrs' = getHeaders hdrs 57 58 extractColumnDefaulting :: BS.ByteString -> Vector Csv.NamedRecord -> [BS.ByteString] 59 extractColumnDefaulting hdr records = 60 extractColumn hdr records (maybe "" id) 61 62 extractColumn :: BS.ByteString -> Vector Csv.NamedRecord -> (Maybe BS.ByteString -> a) -> [a] 63 extractColumn hdr records applyDefault = 64 ( V.toList 65 . fmap (\rowMap -> (applyDefault . HMP.lookup hdr) rowMap)) 66 records 67 68 -- similar to decodeByName with except make parser parameter explicity 69 decodeCsvByNameWith :: (Csv.NamedRecord -> Csv.Parser a) -> BS.ByteString -> Either String (Csv.Header, Vector a) 70 decodeCsvByNameWith parseNamedRecord' contents = do 71 -- Csv.decodeByNameWith Csv.defaultDecodeOptions contents 72 (hdr, rcrds) <- parseCsvWithHeader contents 73 let nRcrds = fmap Csv.toNamedRecord rcrds 74 let rcrdsParser = traverse parseNamedRecord' nRcrds 75 vals <- Csv.runParser rcrdsParser 76 pure (hdr, vals) 77 78 parseCsvWithHeader :: BS.ByteString -> Either String (Csv.Header, Vector Csv.NamedRecord) 79 parseCsvWithHeader contents = 80 runCsvParser ATTO.parseOnly contents 81 82 runCsvParser 83 :: (ATTO.Parser (Csv.Header, Vector Csv.NamedRecord) -> BS.ByteString -> Either String (Csv.Header, Vector Csv.NamedRecord)) 84 -> BS.ByteString 85 -> Either String (Csv.Header, Vector Csv.NamedRecord) 86 runCsvParser parse contents = do 87 (hdrs, rows) <- parse 88 (Csv.csvWithHeader Csv.defaultDecodeOptions) 89 ((repairCarriageReturnOnly . repairDuplicateLineEndings . removeBomPrefix) contents) 90 pure (hdrs, scrub rows) 91 92 scrub :: Vector Csv.NamedRecord -> Vector Csv.NamedRecord 93 scrub rows = 94 fmap scrubRow rows 95 where 96 scrubRow :: Csv.NamedRecord -> Csv.NamedRecord 97 scrubRow row = 98 HMP.map (\v -> if (BSC.all (== ' ') v) then "" else v) row 99 100 -- | some programs introduce a byte order mark when generating a CSV, remove this per cassava issue recipe 101 removeBomPrefix :: BS.ByteString -> BS.ByteString 102 removeBomPrefix contents = 103 MB.fromMaybe contents (BS.stripPrefix "\357\273\277" contents) 104 105 -- | some programs introduce a byte order mark when generating a CSV, remove this per cassava issue recipe 106 removeBomPrefixText :: TL.Text -> TL.Text 107 removeBomPrefixText contents = 108 case TL.uncons contents of 109 Just ('\65279', rs) -> rs 110 _ -> contents 111 112 -- | fix duplicate line endings, unclear if SPSS or Excel introduces them 113 repairDuplicateLineEndings :: BS.ByteString -> BS.ByteString 114 repairDuplicateLineEndings contents = 115 BSL.toStrict (Search.replace "\r\r\n" ("\r\n" :: BS.ByteString) contents) 116 117 -- | only fix newlines for bizarre macOS endings that use \r instead of \r\n 118 repairCarriageReturnOnly :: BS.ByteString -> BS.ByteString 119 repairCarriageReturnOnly contents = 120 let 121 hasNewline = BSC.elem '\n' contents 122 in 123 if hasNewline 124 then contents 125 else BSC.concatMap (\c -> if c == '\r' then "\r\n" else BSC.singleton c) contents