import Control.Arrow import Control.Monad (when) import Data.Char import Data.List import Data.Maybe ( fromMaybe ) import qualified Data.Map as Map import qualified Data.Set as Set import Data.Function import Ngrams import System.Environment import System.Console.GetOpt import Text.Tabular import Text.Tabular.AsciiArt as TA data DisplayCountsAs = SigDigits Int | Log Int | None deriving Show data Options = Options { optNgramSize :: Int , optCountMode :: DisplayCountsAs } deriving Show defaultOptions = Options { optNgramSize = 2 , optCountMode = SigDigits 1 } options :: [OptDescr (Options -> Options)] options = [ Option [] ["none"] (NoArg (setCountMode None)) "no counts" , Option [] ["log"] (OptArg (setCountMode . Log . fromMaybeArg 10) "BASE") "log BASE (10 by default)" , Option [] ["sigdigits"] (OptArg (setCountMode . SigDigits . fromMaybeArg 1) "INT") "INT sig digits (1 by default)" , Option [] ["ngrams"] (ReqArg (\x o -> o { optNgramSize = readOrArgError x }) "INT") "INT-grams" ] where setCountMode x opt = opt { optCountMode = x } main :: IO () main = do argv <- getArgs (opts,ns) <- case getOpt Permute options argv of (o,n,[] ) -> return (foldl (flip id) defaultOptions o, n) (_,_,errs) -> failWith (unlines errs) case ns of [f1, f2] -> goAll opts [f1,f2] _ -> failWith "need 2 files to compare" where failWith errs = do pname <- getProgName let header = "Usage: " ++ pname ++ " [OPTION] file1 file2" fail (errs ++ "\n" ++ usageInfo header options) goAll :: Options -> [FilePath] -> IO () goAll opts fns = do ms <- mapM (fmap (toNgramMap ng_sz) . readFile) fns let basicTable = ngramTable ms putStrLn . TA.render show . toTabular fns . filterInteresting --- XXX : should be parameterisable $ basicTable -- let doBlock n = title n ++ "\n" ++ -- concatMap doBlock [1..3] return () where ng_sz = optNgramSize opts -- title n = "== " ++ show n ++ "-grams ==" filterInteresting m = foldr filter m [ interesting_logdiff, interesting_ten ] interesting_ten (_, xs) = any (> 100) xs interesting_logdiff (_, xs) = case map (logBaseN 2) xs of [] -> False (y:ys) -> any (\z -> abs (z - y) > 1) ys toTabular fns kvs = Table (Group NoLine row_hs) (Group NoLine col_hs) cols where col_hs = map Header fns (row_hs, cols) = unzip $ map (first ngramH) $ kvs ngramH = Header . unwords type Ngram = [String] showG :: DisplayCountsAs -> (Ngram, Int) -> String showG cmode (s,i) = intercalate "\t" $ unwords s : count where count = case cmode of None -> [] SigDigits n -> [show $ sigdigits n i] Log n -> [show $ logBase10 i] -- | 'sigdigits' @d n@ rounds @n@ to @d@ significant digits sigdigits :: Int -> Int -> Int sigdigits d n = tens * (n `div` tens) where base = case logBase10 n - d + 1 of x | x > 0 -> x | otherwise -> 0 tens = 10 ^ base logBase10 :: Int -> Int logBase10 = logBaseN 10 logBaseN :: Float -> Int -> Int logBaseN b = floor . logBase b . fromIntegral -- XXX - I have a feeling this could be much more elegant ngramTable ms = map keyPair allKeys where allKeys = Set.toAscList . foldr Set.union Set.empty . map Map.keysSet $ ms keyPair k = (k, map (keyVal k) ms) keyVal k = Map.findWithDefault 0 k toNgramMap n = histogram . allgrams n . lines -- | grams @n ss@ gets all @n@-grams in all -- sentences... note that we take the ngrams separately -- for each sentence allgrams :: Int -> [String] -> [Ngram] allgrams n = concatMap (ngrams n . words) histogram :: Ord a => [a] -> Map.Map a Int histogram xs = Map.fromListWith (+) $ zip xs (repeat 1) -- ---------------------------------------------------------------------- -- helpers -- ---------------------------------------------------------------------- fromMaybeArg :: Read a => a -> Maybe String -> a fromMaybeArg d ms = maybe d readOrArgError ms readOrArgError :: Read a => String -> a readOrArgError s = case maybeRead s of Nothing -> error $ "Could not read argument " ++ s Just x -> x maybeRead :: Read a => String -> Maybe a maybeRead s = case reads s of [(x, s')] | all isSpace s' -> Just x _ -> Nothing