1 {-# LANGUAGE OverloadedStrings #-}
    2 module Data.Csv.Contrib
    3   ( getHeaders
    4   , extractColumnsInitialRows
    5   , extractColumnsDistinctSample
    6   , extractColumnDefaulting
    7   , extractColumn
    8   , decodeCsvByNameWith
    9   , parseCsvWithHeader
   10   , removeBomPrefixText
   11   -- for testing only
   12   , removeBomPrefix
   13   , repairDuplicateLineEndings
   14   , repairCarriageReturnOnly
   15   ) where
   16 
   17 import qualified Data.Attoparsec.ByteString as ATTO
   18 import qualified Data.ByteString as BS
   19 import qualified Data.ByteString.Char8 as BSC
   20 import qualified Data.ByteString.Lazy as BSL
   21 import qualified Data.ByteString.Search as Search
   22 import qualified Data.Csv as Csv
   23 import qualified Data.Csv.Parser as Csv
   24 import qualified Data.HashMap.Strict as HMP
   25 import qualified Data.List as L
   26 import qualified Data.Maybe as MB
   27 import qualified Data.Text.Lazy as TL
   28 import qualified Data.Vector as V
   29 import Data.Vector (Vector)
   30 
   31 getHeaders :: Csv.Header -> [BS.ByteString]
   32 getHeaders = V.toList
   33 
   34 extractColumnsInitialRows :: Int -> Csv.Header -> Vector Csv.NamedRecord -> [(BS.ByteString, [BS.ByteString])]
   35 extractColumnsInitialRows maxRows hdrs records =
   36     zip
   37         hdrs'
   38         (fmap (\hdr -> extractColumnDefaulting hdr truncatedRecords) hdrs')
   39   where
   40     truncatedRecords = V.take maxRows records
   41     hdrs' :: [BS.ByteString]
   42     hdrs' = getHeaders hdrs
   43 
   44 extractColumnsDistinctSample :: Int -> Csv.Header -> Vector Csv.NamedRecord -> [(BS.ByteString, [BS.ByteString])]
   45 extractColumnsDistinctSample maxSamples hdrs records =
   46     zip hdrs'
   47         ( fmap
   48               ( getSample
   49               . (\hdr -> extractColumnDefaulting hdr records))
   50               hdrs' )
   51   where
   52     getSample :: [BS.ByteString] -> [BS.ByteString]
   53     getSample vals =
   54         (take maxSamples . L.nub) vals
   55     hdrs' :: [BS.ByteString]
   56     hdrs' = getHeaders hdrs
   57 
   58 extractColumnDefaulting :: BS.ByteString -> Vector Csv.NamedRecord -> [BS.ByteString]
   59 extractColumnDefaulting hdr records =
   60    extractColumn hdr records (maybe "" id)
   61 
   62 extractColumn :: BS.ByteString -> Vector Csv.NamedRecord -> (Maybe BS.ByteString -> a) -> [a]
   63 extractColumn hdr records applyDefault =
   64    ( V.toList
   65    . fmap (\rowMap -> (applyDefault . HMP.lookup hdr) rowMap))
   66    records
   67 
   68 -- similar to decodeByName with except make parser parameter explicity
   69 decodeCsvByNameWith :: (Csv.NamedRecord -> Csv.Parser a) -> BS.ByteString -> Either String (Csv.Header, Vector a)
   70 decodeCsvByNameWith parseNamedRecord' contents = do
   71     -- Csv.decodeByNameWith Csv.defaultDecodeOptions contents
   72     (hdr, rcrds) <- parseCsvWithHeader contents
   73     let nRcrds = fmap Csv.toNamedRecord rcrds
   74     let rcrdsParser = traverse parseNamedRecord' nRcrds
   75     vals <- Csv.runParser rcrdsParser
   76     pure (hdr, vals)
   77 
   78 parseCsvWithHeader :: BS.ByteString -> Either String (Csv.Header, Vector Csv.NamedRecord)
   79 parseCsvWithHeader contents =
   80     runCsvParser ATTO.parseOnly contents
   81 
   82 runCsvParser
   83     :: (ATTO.Parser (Csv.Header, Vector Csv.NamedRecord) -> BS.ByteString -> Either String (Csv.Header, Vector Csv.NamedRecord))
   84     -> BS.ByteString
   85     -> Either String (Csv.Header, Vector Csv.NamedRecord)
   86 runCsvParser parse contents = do
   87     (hdrs, rows) <- parse
   88           (Csv.csvWithHeader Csv.defaultDecodeOptions)
   89           ((repairCarriageReturnOnly . repairDuplicateLineEndings . removeBomPrefix) contents)
   90     pure (hdrs, scrub rows)
   91 
   92 scrub :: Vector Csv.NamedRecord -> Vector Csv.NamedRecord
   93 scrub rows =
   94     fmap scrubRow rows
   95   where
   96     scrubRow :: Csv.NamedRecord -> Csv.NamedRecord
   97     scrubRow row =
   98         HMP.map (\v -> if (BSC.all (== ' ') v) then "" else v) row
   99 
  100 -- | some programs introduce a byte order mark when generating a CSV, remove this per cassava issue recipe
  101 removeBomPrefix :: BS.ByteString -> BS.ByteString
  102 removeBomPrefix contents =
  103     MB.fromMaybe contents (BS.stripPrefix "\357\273\277" contents)
  104 
  105 -- | some programs introduce a byte order mark when generating a CSV, remove this per cassava issue recipe
  106 removeBomPrefixText :: TL.Text -> TL.Text
  107 removeBomPrefixText contents =
  108     case TL.uncons contents of
  109         Just ('\65279', rs) -> rs
  110         _ -> contents
  111 
  112 -- | fix duplicate line endings, unclear if SPSS or Excel introduces them
  113 repairDuplicateLineEndings :: BS.ByteString -> BS.ByteString
  114 repairDuplicateLineEndings contents =
  115     BSL.toStrict (Search.replace "\r\r\n" ("\r\n" :: BS.ByteString) contents)
  116 
  117 -- | only fix newlines for bizarre macOS endings that use \r instead of \r\n
  118 repairCarriageReturnOnly :: BS.ByteString -> BS.ByteString
  119 repairCarriageReturnOnly contents =
  120     let
  121         hasNewline = BSC.elem '\n' contents
  122     in
  123         if hasNewline
  124         then contents
  125         else BSC.concatMap (\c -> if c == '\r' then "\r\n" else BSC.singleton c) contents