-- Evaluates and finds motifs -- by Chris Bailey-Kellogg, 9/2007 -- modified by Scot Drysdale 1/2008, and further modified 10/2008 to -- remove the use of Maybe when computing scores. -- modified by Scot Drysdale 10/2009 to simplify computeScoreDistrib. -- A motif is a pattern that occurs in a set of strings. Actually, it -- is a pattern where some variations are allowed. For each position -- in the motif there is a score (which could be 0) for each possible -- letter. These scores add up to 1, so they can be viewed as the -- probability that the given letter occurs in that position. -- Given a motif, the question is where in the string can we locate the -- motif to get the highest score. -- Given a set of strings, the question is to find a common motif that -- maximizes the scores over all strings. import List import Random import Monad -- Simple exact matches -- Tests to see if first argument is a prefix of the second argument -- (There's actually already a version in List, but good to show.) isPrefix :: String -> String -> Bool isPrefix [] _ = True isPrefix _ [] = False isPrefix (x:xs) (y:ys) = x==y && isPrefix xs ys -- Test to see if first argument is a substring of the second argument isIn :: String -> String -> Bool isIn _ [] = False isIn m s = isPrefix m s || isIn m (tail s) ------------------------------------------------------------------------ -- Now let's have a probabilistic version: -- each position has a distribution of scores over the letters at that position type Score = Double type ScoreDistrib = [(Char, Score)] type Motif = [ScoreDistrib] m :: Motif m = [[('a', 0.5),('c', 0.3),('g', 0.1),('t', 0.1)], [('a', 0.3),('c', 0.1),('g', 0.1),('t', 0.5)]] s = "tatct" -- Require that letter to be defined in the Motif scoreLetter :: Char -> ScoreDistrib -> Score scoreLetter l ((l2,v):lvs) = if l == l2 then v else scoreLetter l lvs scoreLetter l [] = error "lookup failed" -- BTW, that was really a more general function.... lookupSure k ((k2,v):kvs) = if k == k2 then v else lookupSure k kvs lookupSure k [] = error "lookup failed" -- First attempt at scoring: score each letter in s against distribution -- at matching position in m. score1 :: Motif -> String -> Score score1 m s = product (zipWith scoreLetter s m) -- First attempt at seeing how well the motif matches at different places in s: -- march down s and score each suffix scores1 :: Motif -> String -> [Score] scores1 m s = mapTail (score1 m) s -- Like map, but give the function the whole tail rather than a single element. -- I don't think anything like this is predefined, but it's generally useful. mapTail :: ([t] -> a) -> [t] -> [a] mapTail _ [] = [] mapTail f l@(x:xs) = f l : mapTail f xs -- Problem with that approach: the last few tails are shorter than the motif. -- Thus zipWith does not match the whole motif! -- One fix is to give a 0 score if the string is too short to match the motif. score [] _ = 1 score _ [] = 0 score (sc:scs) (l:ls) = scoreLetter l sc * score scs ls scores m s = mapTail (score m) s ------------------------------------------------------------------------ -- Now let's try to learn a motif from a set of strings -- Assume we know the motif width, and we're dealing with DNA motifs (a,c,g,t) -- If we know where the motif is in each string, we can compute the stats type MotifPosition = (String,Int) -- Simple demo ss = ["aactc","taagc","ggaaa","atcaa"] s2ps :: [MotifPosition] s2ps = zip ss [0,0,0,0] -- Find the motif of width w in list of strings ss, starting the motif in -- a string at the position given in s2ps. -- Note that ss can be a proper subset of the strings whose MotifPositions are -- in s2ps. motifAt :: [String] -> [MotifPosition] -> Int -> Motif motifAt ss s2ps w = map computeScoreDistrib . mytranspose $ map (\s -> substring (lookupSure s s2ps) w s) ss -- Substring of length l starting at position p substring :: Int -> Int -> String -> String substring p l = take l . drop p -- Transpose of a list of lists (lol) -- (There's actually already a version in List, but good to show.) mytranspose ([]:_) = [] mytranspose lol = map head lol : (mytranspose . map tail $ lol) -- Compute the distribution of letters in the string "instances" to get the -- probability that each appears. -- Note that "acgt" is appended to instances so that every letter in the -- alphabet will appear in the final score distribution (even if its count is 0) computeScoreDistrib :: String -> ScoreDistrib computeScoreDistrib instances = let counts = count . sort $ "acgt" ++ instances total = intToFloat . length $ instances in map (\(l,c) -> (l, intToFloat c / total)) counts -- Get the count of each letter. Note that it actually reports one less -- than the number of letters, to compensate for appending the alphabet -- to instance in computeScoreDistrib -- Combine a "run" of equal elements into (element, count) -- Based on problem 9 from 99 problems count :: (Eq t) => [t] -> [(t, Int)] count [] = [] count (x:xs) = let (first,rest) = span ((==) x) xs in (x, length first) : count rest -- From SOE intToFloat n = fromInteger (toInteger n) ------------------------------------------------------------------------ -- Now what if we don't know the motif positions? -- Guess and improve. -- For each string, use the motif extracted from current positions in -- other strings to update the position in this string. -- Not guaranteed to find the optimal solution. -- First version: deterministic -- move to best position -- Returns where the motif is in each string, along with the motif definition. -- ss is the list of strings, w is the width of the motif, and maxiter is the -- maximum number of interations that the function will perform. findMotifDet :: [String] -> Int -> Int -> ([MotifPosition],Motif) findMotifDet ss w maxiter = let loos = leaveOneOut ss iter s2ps niter = let s2ps2 = foldl (\s2ps (s,ss) -> moveBest s ss s2ps w) s2ps loos in if niter == maxiter || sort s2ps == sort s2ps2 then (s2ps, motifAt ss s2ps w) else iter s2ps2 (niter+1) in iter (map (\s -> (s,0)) ss) 0 -- Returns a list of pairs of (element, list with element deleted) for each -- possible choice of element. leaveOneOut :: [a] -> [(a, [a])] leaveOneOut [] = [] leaveOneOut (x:xs) = (x,xs) : (map (\(y,ys) -> (y,x:ys)) (leaveOneOut xs)) -- Given the string to update, the remaining strings, the current positions, -- and the motif width, update the position of the motif in the string moveBest :: String -> [String] -> [MotifPosition] -> Int -> [MotifPosition] moveBest s ss s2ps w = replace s (bestPos (motifAt ss s2ps w) s) s2ps -- In list of (key,value) pairs, substitute v for k's current value replace k v [] = [(k,v)] replace k v ((k2,v2):kvs) = if k == k2 then (k,v):kvs else (k2,v2) : replace k v kvs -- Find the best position for motif m in string s bestPos :: Motif -> String -> Int bestPos m s = snd . fmaximum fst $ zip (scores m s) [0..] -- Maximum of a list, according to a function applied to the members -- In case of ties, takes the first max in the list. -- (This is important if we use 0 scores for places where the remaining -- string is shorter than the pattern, in case all scores are 0.) fmaximum f l = snd . foldr1 (\x y -> if fst x >= fst y then x else y) $ zip (map f l) l -- or (less efficient, since recomputes f) fmaximum2 f l = foldr1 (\x y -> if (f x) >= (f y) then x else y) l ------------------------------------------------------------------------ -- The actual algorithm updates by moving not to the best position, -- but rather to a position chosen randomly with probability according -- to the score. findMotifRand :: [String] -> Int -> Int -> IO ([MotifPosition],Motif) findMotifRand ss w maxiter = let loos = leaveOneOut ss s2ps = map (\s -> (s,0)) ss iter s2ps niter = if niter == maxiter then return (s2ps, motifAt ss s2ps w) else do s2ps2 <- foldM (\s2ps (s,ss) -> moveRandom s ss s2ps w) s2ps loos iter s2ps2 (niter+1) in iter s2ps 0 moveRandom :: String -> [String] -> [MotifPosition] -> Int -> IO [MotifPosition] moveRandom s ss s2ps w = do p <- randomPos (motifAt ss s2ps w) s return (replace s p s2ps) randomPos :: Motif -> String -> IO Int randomPos m s = let sc = scores m s (tot,tots) = mapAccumL (\tot e -> (tot+e, tot+e)) 0 sc in do p <- randomRIO (0::Score, tot) return (snd $ head $ dropWhile (\(t,i) -> t < p) $ zip tots [0..])