-- Program to accompany JFP Functional Pearl
-- "Enumerating the strings of a regular language"
-- M. Douglas McIlroy
-- The code in the paper works with Hugs 1.4; the code here has
-- been updated to work with Hugs 98 as well. Changes, flagged
-- H98 and H1.4, are necessary because
-- "Word" conflicts with H98 standard prelude,
-- ++ is not overloadable in H98.
infixr 5 +++ -- catenate LOL data -- H98 (++ in H1.4)
infixl 6 \/ -- set union
-- SET OPERATIONS
(\/) :: Ord a => [a] -> [a] -> [a]
xprod :: (Ord a, Ord b, Ord c) => (a->b->c) -> [a] -> [b] -> [c]
closure :: Ord a => (a->a->a) -> a -> [a] -> [a]
[] \/ ys = ys
xs \/ [] = xs
xs@(x:xt) \/ ys@(y:yt) = case compare x y of
LT -> x : xt\/ys
EQ -> x : xt\/yt
GT -> y : xs\/yt
xprod _ [] _ = []
xprod _ _ [] = []
xprod f (x:xt) ys@(y:yt) =
(f x y) : (xprod f [x] yt) \/ (xprod f xt ys)
closure f z [] = [z]
closure f z xs@(x:xt) = if x==z
then closure f z xt
else z : xprod f xs (closure f z xs)
-- GENERATE LANGUAGE FROM REGULAR EXPRESSION
data LOL a = LOL [a] -- length-ordered list
deriving (Eq, Show)
instance Ord a => Ord (LOL a) where
LOL x <= LOL y = (length x, x) <= (length y, y)
(+++) :: LOL a -> LOL a -> LOL a -- H98
LOL x +++ LOL y = LOL (x++y)
-- instance Monad LOL -- H1.4
-- instance MonadZero LOL
-- instance MonadPlus LOL where
-- (LOL x) ++ (LOL y) = LOL (x++y)
type LOS = LOL Char -- length-ordered string
data Rexp = Nil -- empty language
| Eps -- empty string
| Sym Char -- symbol of the alphabet
| Clo Rexp -- Kleene closure
| Cat Rexp Rexp -- catenation
| Alt Rexp Rexp -- alternation
deriving (Show,Eq,Ord)
enumR :: Rexp -> [String]
enumR r = [x | (LOL x) <- enumR' r]
enumR' :: Rexp -> [LOS]
enumR' Nil = []
enumR' Eps = [LOL ""]
enumR' (Sym a) = [LOL [a]]
enumR' (Clo x) = clo (enumR' x)
enumR' (Cat x y) = cat (enumR' x) (enumR' y)
enumR' (Alt x y) = alt (enumR' x) (enumR' y)
alt, cat :: [LOS] -> [LOS] -> [LOS]
clo :: [LOS] -> [LOS]
alt = (\/)
cat = xprod (+++) -- H98
clo = closure (+++) (LOL "")
-- CONSTRUCT AUTOMATON FROM REGULAR EXPRESSION
-- Eliminate Nil from all but top level.
deNil :: Rexp -> Rexp
deNil (Cat x y) = case (deNil x, deNil y) of
(Nil, _) -> Nil
(_, Nil) -> Nil
(x', y') -> Cat x' y'
deNil (Alt x y) = case (deNil x, deNil y) of
(Nil, y') -> y'
(x', Nil) -> x'
(x', y') -> Alt x' y'
deNil (Clo x) = case deNil x of
Nil -> Eps
x' -> Clo x'
deNil x = x
-- A nondeterministic finite automaton (NFA)
-- is known by the set of states it starts in.
-- A single state accepts a single character and
-- moves to a subautomaton (i.e. to a set of states).
-- Each state has a distinguishing (Int) identifier.
-- The unique final state has identifier 0, a dummy
-- accepted character, and no moves: (State 0 '~' []).
type NFA = [State]
data State = State Ident Char NFA
type Ident = Int
accept :: NFA -> Bool
accept ds = 0 `elem` [i | (State i _ _) <- ds]
-- States are ordered by character then identifier;
-- sets of states are kept in order.
instance Eq State where
(State i _ _) == (State i' _ _) = i==i'
instance Ord State where
(State i c _) <= (State i' c' _) = (c,i) <= (c',i')
-- Return destination states ds if bypass flag is
-- true, else empty list.
bp :: Bool -> NFA -> NFA
bp True ds = ds
bp False _ = []
r2n :: Rexp -> NFA -- reg expr to nondet automaton
r2n r = let {
ds = [State 0 '~' []];
(fs, _, b) = r2n' r 1 ds
} in fs \/ (bp b ds)
r2n' :: Rexp -> Ident -> NFA -> (NFA,Ident,Bool)
-- (fs,n',b) = r2n' r n ds
-- r regular expression
-- n number available as identifier
-- ds destination states
-- fs first states (states in r that accept chars)
-- b bypass flag (true if r's language includes Eps)
-- n' next number available as identifier
r2n' Nil n _ = ([], n, False)
r2n' Eps n ds = ([], n, True)
r2n' (Sym c) n ds = ([State n c ds], succ n, False)
r2n' (Cat x y) n ds = let {
(fs, n', b) = r2n' y n ds;
(fs', n'', b') = r2n' x n' (fs\/(bp b ds));
} in (fs'\/(bp b' fs), n'', b&&b')
r2n' (Alt x y) n ds = let {
(fs, n', b) = r2n' y n ds;
(fs', n'', b') = r2n' x n' ds;
} in (fs\/fs', n'', b||b')
r2n' (Clo x) n ds = let {
(fs, n', b) = r2n' x n (fs\/ds)
} in (fs, n', True)
-- GENERATE LANGUAGE FROM AUTOMATON
enumA :: NFA -> [String]
enumA starts = visit [("",starts)]
-- type Word = (String,NFA) -- H1.4
-- visit :: [Word] -> [String]
visit :: [(String,NFA)] -> [String] -- H98
visit [] = []
visit ((x,ds):ws) = let { xs = visit (ws ++
[(x++[c],ds') | (State _ c ds') <- grp ds])
} in if accept ds then x:xs else xs
-- Group moves by character (states with identical chars
-- are adjacent in a state list). A group is repesented
-- as a (possibly fictitious) state. The Ident of a group
-- is never looked at, allowing -1 to be used repeatedly.
grp :: NFA -> NFA
grp (m@(State _ c ds) : ms@((State _ c' ds'):mt)) =
if c==c' then grp ((State (-1) c (ds\/ds')):mt)
else m : grp ms
grp ms = ms
-- Enumerate strings of a regular expression by automaton.
enumRA :: Rexp -> [String]
enumRA = enumA . r2n . deNil
-- INSPECTION AND TEST
-- Create a pair of printable lists that describe an NFA:
-- identifiers of start states
-- state descriptions (identifier, character, moves)
list :: NFA -> ([Ident],[(Ident,Char,[Ident])])
list ds = (map ident ds, list' ds [])
list' :: NFA -> [Ident] -> [(Ident,Char,[Ident])]
list' [] _ = []
list' ((State i c ds):xs) is = if i `elem` is
then list' xs is
else (i,c,map ident ds) : list' (xs++ds) (i:is)
ident :: State -> Ident
ident (State i _ _) = i
-- Compare the first n strings of the language of
-- regular expression r, as calculated by the direct
-- enumerator enumR and the automaton-based enumRA.
check :: Int -> Rexp -> Bool
check n r = take n (enumR r) == take n (enumRA r)
-- test n e r (Almost) silently generate the first n strings in
-- the language of r by enumerator e (enumR or enumRA).
-- This allows stress/timing tests without the cost of printing.
test n e r = length (take n (e r))
-- dexprs n Exhaustively list regular expressions in primitives
-- a and b with operator nesting depth at most n.
dexprs :: Int -> [Rexp]
dexprs 0 = [a, b]
dexprs i = let x = dexprs (i-1)
in x \/ (xprod Cat x x) \/ (xprod Alt x x) \/ (map Clo x)
-- sexprs n Exhaustively list regular expressions of
-- size n built from primitives in sexprs 1.
-- cumsexprs n Cumulatively list expressions of size n or less.
sexprs :: Int -> [Rexp]
sexprs 0 = []
sexprs 1 = [Nil, Eps, a, b]
sexprs n = map Clo (sexprs (n-1)) ++
foldr (++) [] [xprod Cat p q ++ xprod Alt p q |
(p,q) <- [((sexprs i), (sexprs (n-1-i))) |
i <- [1..(n-1)]]]
cumsexprs :: Int -> [Rexp]
cumsexprs 1 = sexprs 1
cumsexprs n = cumsexprs (n-1) \/ sexprs n
-- dcheck (scheck) Check first m strings for every
-- regular expression of depth (size) n or less.
dcheck, scheck :: Int -> Int -> Bool
dcheck m n = and (map (check m) (dexprs n))
scheck m n = and (map (check m) (cumsexprs n))
-- Test expressions
a = Sym 'a'
b = Sym 'b'
c = Sym 'c'
aa = Cat a a
bb = Cat b b
ab = Cat a b
ba = Cat b a
a_a = Alt a a
b_b = Alt b b
a_b = Alt a b
sandwich = Cat a (Cat (Clo b) a) -- a b* a
even_a = Clo (Alt sandwich b) -- (ab*a|b)*
rep 1 x = x
rep n x = Cat x (rep (n-1) x) -- x^n (catenation of n strings in x)
-- An NFA whose equivalent DFA is exponentially large
aho n = (Cat (Clo a_b) (Cat a (rep n a_b))) -- (a|b)*a(a|b)^n
-- Used as an example in Darrell Raymond, "Grail: a
-- C++ library for finite-state machines and regular expressions"
-- Tech report #358, Dept of CS, U of Western Ontario (March 1994)
grail = Cat (Clo (Alt a b)) (Cat a (Cat b c))