How do I write a simple regular expression pattern matching function in C or C++?

后端 未结 9 957
我寻月下人不归
我寻月下人不归 2021-02-01 06:58

This is a question in my paper test today, the function signature is

int is_match(char* pattern,char* string)

The pattern is limited to only A

相关标签:
9条回答
  • 2021-02-01 07:31

    A C program to find the index,from where the sub-string in the main string is going to start. enter code here

    #include<stdio.h>
    int mystrstr (const char *,const char *);
    int mystrcmp(char *,char *);
    int main()
    {
        char *s1,*s2;//enter the strings, s1 is main string and s2 is substring.
        printf("Index is %d\n",mystrstr(s1,s2));
        //print the index of the string if string is found
    }
    //search for the sub-string in the main string
    int mystrstr (const char *ps1,const char *ps2) 
    {
        int i=0,j=0,c=0,l,m;char *x,*y;
        x=ps1;
        y=ps2;
        while(*ps1++)i++;
        while(*ps2++)j++;
        ps1=x;
        ps2=y;
        char z[j];
        for(l=0;l<i-j;l++)
        {
            for(m=l;m<j+l;m++)
                //store the sub-string of similar size from main string
                z[c++]=ps1[m];
            z[c]='\0'
            c=0;
            if(mystrcmp(z,ps2)==0)
            break;
        }
        return l;
    }
    
    
    int mystrcmp(char *ps3,char *ps4) //compare two strings
    {
        int i=0;char *x,*y;
        x=ps3;y=ps4;
        while((*ps3!=0)&&(*ps3++==*ps4++))i++;      
        ps3=x;ps4=y;
        if(ps3[i]==ps4[i])
            return 0;
        if(ps3[i]>ps4[i])
            return +1;
        else
            return -1;
    }
    
    0 讨论(0)
  • 2021-02-01 07:35

    Here is recursive extendable implementation. Tested for first order of pattern complexity.

    #include <string.h>
    #include <string>
    #include <vector>
    #include <iostream>
    
    struct Match {
      Match():_next(0) {}
      virtual bool match(const char * pattern, const char * input) const {
        return !std::strcmp(pattern, input);
      }
      bool next(const char * pattern, const char * input) const {
        if (!_next) return false;
        return _next->match(pattern, input);
      }
      const Match * _next;
    };
    
    class MatchSet: public Match {
      typedef std::vector<Match *> Set;
      Set toTry;
    public:
      virtual bool match(const char * pattern, const char * input) const {
        for (Set::const_iterator i = toTry.begin(); i !=toTry.end(); ++i) {
          if ((*i)->match(pattern, input)) return true;
        }
        return false;
      }
      void add(Match * m) {
        toTry.push_back(m);
        m->_next = this;
      }
      ~MatchSet() {
        for (Set::const_iterator i = toTry.begin(); i !=toTry.end(); ++i)
          if ((*i)->_next==this) (*i)->_next = 0;
      }
    };
    
    struct MatchQuestion: public Match  {
      virtual bool match(const char * pattern, const char * input) const {
        if (pattern[0] != '?')
          return false;
        if (next(pattern+1, input))
          return true;
        if (next(pattern+1, input+1))
          return true;
        return false;
      }
    };
    
    struct MatchEmpty: public Match {
      virtual bool match(const char * pattern, const char * input) const {
        if (pattern[0]==0 && input[0]==0)
          return true;
        return false;
      }
    };
    
    struct MatchAsterisk: public Match {
      virtual bool match(const char * pattern, const char * input) const {
        if (pattern[0] != '*')
          return false;
        if (pattern[1] == 0) {
          return true;
        }
        for (int i = 0; input[i] != 0; ++i) {
          if (next(pattern+1, input+i))
            return true;
        }
        return false;
      }
    };
    
    struct MatchSymbol: public Match {
      virtual bool match(const char * pattern, const char * input) const {
        // TODO: consider cycle here to prevent unnecessary recursion
        // Cycle should detect special characters and call next on them
        // Current implementation abstracts from that
        if (pattern[0] != input[0])
          return false;
        return next(pattern+1, input+1);
      }
    };
    
    class DefaultMatch: public MatchSet {
      MatchEmpty empty;
      MatchQuestion question;
      MatchAsterisk asterisk;
      MatchSymbol symbol;
    public:
      DefaultMatch() {
        add(&empty);
        add(&question);
        add(&asterisk);
        add(&symbol);
      }
      void test(const char * p, const char * input) const {
        testOneWay(p, input);
        if (!std::strcmp(p, input)) return;
        testOneWay(input, p);
      }
      bool testOneWay(const char * p, const char * input) const {
        const char * eqStr = " == ";
        bool rv = match(p, input);
        if (!rv) eqStr = " != ";
        std::cout << p << eqStr << input << std::endl;
        return rv;
      }
    
    };
    
    
    int _tmain(int argc, _TCHAR* argv[])
    {
      using namespace std;
    
      typedef vector<string> Strings;
      Strings patterns;
    
      patterns.push_back("*");
      patterns.push_back("*hw");
      patterns.push_back("h*w");
      patterns.push_back("hw*");
    
      patterns.push_back("?");
      patterns.push_back("?ab");
      patterns.push_back("a?b");
      patterns.push_back("ab?");
    
      patterns.push_back("c");
      patterns.push_back("cab");
      patterns.push_back("acb");
      patterns.push_back("abc");
    
      patterns.push_back("*this homework?");
      patterns.push_back("Is this homework?");
      patterns.push_back("This is homework!");
      patterns.push_back("How is this homework?");
    
      patterns.push_back("hw");
      patterns.push_back("homework");
      patterns.push_back("howork");
    
      DefaultMatch d;
      for (unsigned i = 0; i < patterns.size(); ++i)
        for (unsigned j =i; j < patterns.size(); ++j)
          d.test(patterns[i].c_str(), patterns[j].c_str());
    
        return 0;
    }
    

    If something is unclear, ask.

    0 讨论(0)
  • 2021-02-01 07:37

    Cheat. Use #include <boost/regex/regex.hpp>.

    0 讨论(0)
  • 2021-02-01 07:40

    Brian Kernighan provided a short article on A Regular Expression Matcher that Rob Pike wrote as a demonstration program for a book they were working on. The article is a very nice read explaining a bit about the code and regular expressions in general.

    I have played with this code, making a few changes to experiment with some extensions such as to also return where in the string the pattern matches so that the substring matching the pattern can be copied from the original text.

    From the article:

    I suggested to Rob that we needed to find the smallest regular expression package that would illustrate the basic ideas while still recognizing a useful and non-trivial class of patterns. Ideally, the code would fit on a single page.

    Rob disappeared into his office, and at least as I remember it now, appeared again in no more than an hour or two with the 30 lines of C code that subsequently appeared in Chapter 9 of TPOP. That code implements a regular expression matcher that handles these constructs:

    c    matches any literal character c
    .    matches any single character
    ^    matches the beginning of the input string
    $    matches the end of the input string
    *    matches zero or more occurrences of the previous character
    

    This is quite a useful class; in my own experience of using regular expressions on a day-to-day basis, it easily accounts for 95 percent of all instances. In many situations, solving the right problem is a big step on the road to a beautiful program. Rob deserves great credit for choosing so wisely, from among a wide set of options, a very small yet important, well-defined and extensible set of features.

    Rob's implementation itself is a superb example of beautiful code: compact, elegant, efficient, and useful. It's one of the best examples of recursion that I have ever seen, and it shows the power of C pointers. Although at the time we were most interested in conveying the important role of a good notation in making a program easier to use and perhaps easier to write as well, the regular expression code has also been an excellent way to illustrate algorithms, data structures, testing, performance enhancement, and other important topics.

    The actual C source code from the article is very very nice.

    /* match: search for regexp anywhere in text */
    int match(char *regexp, char *text)
    {
        if (regexp[0] == '^')
            return matchhere(regexp+1, text);
        do {    /* must look even if string is empty */
            if (matchhere(regexp, text))
                return 1;
        } while (*text++ != '\0');
        return 0;
    }
    
    /* matchhere: search for regexp at beginning of text */
    int matchhere(char *regexp, char *text)
    {
        if (regexp[0] == '\0')
            return 1;
        if (regexp[1] == '*')
            return matchstar(regexp[0], regexp+2, text);
        if (regexp[0] == '$' && regexp[1] == '\0')
            return *text == '\0';
        if (*text!='\0' && (regexp[0]=='.' || regexp[0]==*text))
            return matchhere(regexp+1, text+1);
        return 0;
    }
    
    /* matchstar: search for c*regexp at beginning of text */
    int matchstar(int c, char *regexp, char *text)
    {
        do {    /* a * matches zero or more instances */
            if (matchhere(regexp, text))
                return 1;
        } while (*text != '\0' && (*text++ == c || c == '.'));
        return 0;
    }
    
    0 讨论(0)
  • 2021-02-01 07:43

    try to make a list of interesting test cases:

    is_match("dummy","dummy") should return true;

    is_match("dumm?y","dummy") should return true;

    is_match("dum?y","dummy") should return false;

    is_match("dum*y","dummy") should return true;

    and so on ...

    then see how to make the easier test pass, then the next one ...

    0 讨论(0)
  • 2021-02-01 07:46

    The full power of regular expressions and finite state machines are not needed to solve this problem. As an alternative there is a relatively simple dynamic programming solution.

    Let match(i, j) be 1 if it is possible to match the the sub-string string[i..n-1] with the sub-pattern pattern[j, m - 1], where n and m are the lengths of string and pattern respectively. Otherwise let match(i, j) be 0.

    The base cases are:

    • match(n, m) = 1, you can match an empty string with an empty pattern;

    • match(i, m) = 0, you can't match a non-empty string with an empty pattern;

    The transition is divided into 3 cases depending on whether the current sub-pattern starts with a character followed by a '*', or a character followed by a '?' or just starts with a character with no special symbol after it.

    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    
    int is_match(char* pattern, char* string)
    {
      int n = strlen(string);
      int m = strlen(pattern);
    
      int i, j;
      int **match;
    
      match = (int **) malloc((n + 1) * sizeof(int *));
      for(i = 0; i <= n; i++) {
        match[i] = (int *) malloc((m + 1) * sizeof(int));
      }
    
      for(i = n; i >= 0; i--) {
        for(j = m; j >= 0; j--) {
          if(i == n && j == m) {
            match[i][j] = 1;
          }
          else if(i < n && j == m) {
            match[i][j] = 0;
          }
          else {
            match[i][j] = 0;
            if(pattern[j + 1] == '*') {
              if(match[i][j + 2]) match[i][j] = 1;
              if(i < n && pattern[j] == string[i] && match[i + 1][j]) match[i][j] = 1;
            }
            else if(pattern[j + 1] == '?') {
              if(match[i][j + 2]) match[i][j] = 1;
              if(i < n && pattern[j] == string[i] && match[i + 1][j + 2]) match[i][j] = 1;
            }
            else if(i < n && pattern[j] == string[i] && match[i + 1][j + 1]) {
              match[i][j] = 1;
            }
          }
        }
      }
    
      int result = match[0][0];
    
      for(i = 0; i <= n; i++) {
        free(match[i]);
      }
    
      free(match);
    
      return result;
    }
    
    int main(void)
    {
      printf("is_match(dummy, dummy)  = %d\n", is_match("dummy","dummy"));
      printf("is_match(dumm?y, dummy) = %d\n", is_match("dumm?y","dummy"));
      printf("is_match(dum?y, dummy)  = %d\n", is_match("dum?y","dummy"));
      printf("is_match(dum*y, dummy)  = %d\n", is_match("dum*y","dummy")); 
    
      system("pause");
    
      return 0;
    }
    

    The time complexity of this approach is O(n * m). The memory complexity is also O(n * m) but with a simple modification can be reduced to O(m).

    0 讨论(0)
提交回复
热议问题