首页 > 解决方案 > 在构建正则表达式引擎时处理类似 `.*` 之后的尾随模式

问题描述

我尝试使用 Haskell 中的解析器组合器来实现正则表达式引擎。

这个想法是首先解析模式以构建一个正则表达式解析器,然后使用解析器解析输入文本。

但是我的代码无法处理这种情况:如果模式类似于a?(a|b).*h+,那么h+after.*将永远不会满足,因为该.*部分将消耗所有输入。

(由于效率的原因,通常<|>是用tryfunc 实现的。但在这里我不认为是这种情况。

下面是我的代码

{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ScopedTypeVariables #-}

module Matcher where

import           Data.Function (on)
import           Data.Functor (($>))
import           Control.Monad
import           Control.Applicative

newtype Parser a = Parser { runParser :: String -> Maybe (a, String) }

parse :: Parser a -> String -> Maybe a
parse parser inp = case runParser parser inp of
  Just (a, []) -> Just a
  _ -> Nothing

instance Functor Parser where
  fmap t (Parser g) = Parser $ \inp -> fmap (\(a, inp') -> (t a, inp')) (g inp)

instance Applicative Parser where
  pure a = Parser $ \inp -> Just (a, inp)

  (Parser f) <*> (Parser g) = Parser
    $ \inp -> case f inp of
      Nothing        -> Nothing
      Just (t, inp') -> case g inp' of
        Nothing         -> Nothing
        Just (a, inp'') -> Just (t a, inp'')

instance Alternative Parser where
  empty = Parser $ const Nothing

  (Parser f) <|> (Parser g) = Parser
    $ \inp -> case f inp of
      Nothing -> g inp
      Just r  -> Just r

  some p = (:) <$> p <*> (some p <|> pure [])

instance Monad Parser where
  (Parser f) >>= g = Parser
    $ \inp -> case f inp of
      Nothing        -> Nothing
      Just (a, inp') -> runParser (g a) inp'

satisfy p = Parser
  $ \case
    (x:xs)
      | p x -> Just (x, xs)
    _      -> Nothing

char :: Char -> Parser Char
char c = satisfy (== c)

string :: String -> Parser String
string "" = pure ""
string (c:cs) = do
  char c
  string cs
  return (c:cs)

between :: Parser a -> Parser b -> Parser c -> Parser b
between left content right = left *> content <* right

unit :: Parser (Parser String)
unit = (do
          c <- satisfy (`notElem` ['.', '*', '?', '+', '(', ')', '|'])
          return $ fmap (:[]) (char c))
  <|> (char '.' $> fmap (:[]) (satisfy (const True)))
  <|> oneOf

oneOf :: Parser (Parser String)
oneOf = between (char '(') content (char ')')
  where
    content = do
      p1 <- unit
      rest <- many $ char '|' *> unit
      return $ foldr (<|>) p1 rest

oneOrMore :: Parser (Parser String)
oneOrMore = do
  arp <- unit
  char '+'
  return $ fmap concat (some arp)

zeroOrMore :: Parser (Parser String)
zeroOrMore = do
  arp <- unit
  char '*'
  return $ fmap concat (many arp)

zeroOrOne :: Parser (Parser String)
zeroOrOne = do
  arp <- unit
  char '?'
  return $ arp <|> pure ""

reg :: Parser (Parser String)
reg = fmap
  (foldr (liftA2 (++)) (pure ""))
  (some $ zeroOrMore <|> zeroOrOne <|> oneOrMore <|> unit)

createRegParser :: String -> Maybe (Parser String)
createRegParser = parse reg

example = createRegParser "a?(a|b).*h+"

标签: regexparsinghaskellparser-combinators

解决方案


推荐阅读