file name matching with wildcard

前端 未结 7 1626
闹比i
闹比i 2021-01-01 16:14

I need to implement something like my own file system. One operation would be the FindFirstFile. I need to check, if the caller passed something like ., sample*.cpp

相关标签:
7条回答
  • 2021-01-01 16:37

    The solution Mehrdad provided has exponential running time because it uses backtracking. It takes it like a second to figure out that "*a*??????*a*?????????a???????????????", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" is a match. Also there's no way to match a string with '*' or '?' in it.

    Here's an O(nm) implementation I came up with which has O(n) memory usage, where n is the length of the expression, m the length of the string. It also supports escaping ?, * and \.

    #include <stddef.h>
    #include <stdlib.h>
    #include <string.h>
    
    /** 
     * Wildcard matching.
     * See for example
     * http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/find_c_search_wildcard.mspx?mfr=true
     *
     *  * matches any amount of any characters, 
     *  ? matches any single character.
     *  c matches c.
     *  No escaping of * and ? implemented (these are not allowed in windows filenames anyways).
     *
     * Backtracking O(2^n) implementation.
     *
     * from http://stackoverflow.com/questions/3300419/file-name-matching-with-wildcard 
     * http://stackoverflow.com/a/12231681/524504
     */
    template<class Pattern, class Text>
    bool wildcard(
            Pattern const pat_begin,
            Pattern const pat_end,
            Text text_begin, 
            Text const text_end)
    {
        ptrdiff_t const pat_size = pat_end - pat_begin;
    
        /* initial pattern position stack (offsets into the pattern string) and its size c */
        ptrdiff_t stackbuf[64];
        size_t c                     = sizeof(stackbuf) / sizeof(*stackbuf);
        /* Stack base. Can be realloc'ed to some new memory location if we need more */
        ptrdiff_t *p                 = stackbuf;
    
        /* pointer to the location in the stack */
        size_t n                     = 0;
        p[n++]                       = 0; /* position 0 in the stack not used */
    
        /* text_begin updated to skip everything successfully consumed  */
        while (n > 0 && text_begin != text_end) 
        {
            for (size_t i = 0; i < n; i++) 
            {
                /* if we're at the end of the pattern, but not at the end of the 
                 * string, we might have done something wrong.
                 */
                if (p[i] == pat_size) 
                {
                    p[i--] = p[--n];
                    continue;
                }
                /* current pattern character */
                switch (*(pat_begin + p[i]))
                {
                    case '?': ++p[i]; break; /* simply advance pattern pointer */
                    case '*':
                              ptrdiff_t off;
                              off = p[i];
                              while (off < pat_size &&
                                      *(pat_begin + off) == '*')
                              { ++off; }
                              /* if the stack is full, reallocate */
                              if (n == c)
                              {
                                  ptrdiff_t const *const old = p;
                                  c *= 2;
                                  /* assert positive size
                                   * stack size never reduced anyways?
                                   */
                                  if (c == 0) { ++c; }
                                  size_t const size = c * sizeof(*p);
                                  /* cannot use realloc to copy original stack 
                                   * (ptr in realloc must be 
                                   * "Pointer to a memory block previously 
                                   * allocated with malloc, calloc or realloc."
                                   * must do manually
                                   */
                                  p = (ptrdiff_t *)realloc(
                                           old == stackbuf ? NULL : p,
                                          size);
                                  if (old == stackbuf)
                                  { memcpy(p, old, n * sizeof(*old)); }
                              }
                              /* store offset */
                              p[n++] = off;
                              break;
                    default: /* a normal character in the pattern */
                              /* must be matched exactly */
                              if (*(pat_begin + p[i]) == *text_begin)
                              { ++p[i]; } /* advance pattern pointer */
                              else /* if not, backtrack */
                              { p[i--] = p[--n]; }
                              break;
                }
            }
            ++text_begin;
        }
    
        bool success = false;
        if (text_begin == text_end)
        {
            while (!success && n > 0)
            {
                --n;
                while (p[n] != pat_size &&
                        *(pat_begin + p[n]) == '*')
                { ++p[n]; }
                if (p[n] == pat_size)
                { success = true; }
            }
        }
    
        /* need to free stack if it was reallocated */
        if (p != stackbuf) { free(p); }
        return success;
    }
    
    bool wildcard(char const *const pattern, char const *const text)
    {
        return wildcard(
                pattern,
                pattern + (pattern ? strlen(pattern) : 0),
                text,
                text + (text ? strlen(text) : 0));
    }
    
    bool wildcard(wchar_t const *const pattern, wchar_t const *const text)
    {
        return wildcard(
                pattern,
                pattern + (pattern ? wcslen(pattern) : 0),
                text,
                text + (text ? wcslen(text) : 0));
    }
    
    /**
     * Virtual Machine Style Regular Expression parsing/NFA emulation
     * used for wildcard matching. O(nm) algorithm.
     *
     * See http://swtch.com/~rsc/regexp/ for more on efficient RegEx parsing.
     *
     * Copyright (c) March 29, 2013 Paul Frischknecht
     * Can be distributed under the MIT licence, see bottom of file.
     */
    
    //#define wildcard_fast_DEBUG /* define to make the alogorithm printf what its doing */
    
    /**
     * Instructions are:
     *
     * star
     *   This launches a new thread at the current position and continues with the 
     *   current thread at the next position accepting any character.
     * char c
     *   Accepts exactly the character c
     * anychar
     *   Accepts any character.
     * end 
     *   Accepts the end of the input string.
     */
    enum InstructionType {
        End     = 0,
        Star    = 1,
        Char    = 2,
        AnyChar = 3
    };
    
    struct Instruction {
        InstructionType i;
        int c;       /* the caracter this instruction matches - undefined for non Char */
        int threads; /* threads active in the given instruction */
        /*
         * storing this here allows us to find out wether there is a thread
         * active at a given instruction in O(1)
         */
    };
    
    /** 
     * Wildcard (file path) matching.
     * See for example 
     * http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/find_c_search_wildcard.mspx?mfr=true
     *
     *  * matches any amount of any characters, 
     *  ? matches any single character.
     *  c matches c.
     *
     * Escaping of '*', '?' and '\' via '\*', '\?' and '\\' implemented. 
     * All other bytes are recognized directly as is. 
     * The string may not end in a single (uneven amount of) '\'.
     *
     * If text_end is 0, the text is assumed to be 0 terminated.
     * Same for pattern_end. The end is exclusive.
     *
     * @return A pointer to the character after the last character matched.
     *
     * Virtual machine O(nm) implementation, see http://swtch.com/~rsc/regexp/regexp2.html
     *
     * TODO Factor out the precompilation step to speed up matching multiple strings
     * against the same expression.
     */
    template<class Pattern, class Text>
    Text wildcard_fast(
            Pattern const pat_begin,
            Pattern const pat_end,
            Text const text_begin, 
            Text const text_end)
    {
        /* 
         * give some reasonable default program size so we don't have to call
         * malloc in most cases 
         */
        #define DEFAULTonstack_program_SIZE 256
        Instruction onstack_program[DEFAULTonstack_program_SIZE];
        /* this stores the current run and next run thread list, alternating */
        /* there are as most as many threads as instructions */
        Instruction* onstack_threads[DEFAULTonstack_program_SIZE*2]; 
    
        Instruction** threads      = onstack_threads;
        Instruction*  program      = onstack_program;
        int           program_size = sizeof(onstack_program)/sizeof(*program);
    
        Instruction* program_last_instruction = program + program_size - 1;
    
        /* program and pattern pointers */
        Instruction* pp   = program;
        Pattern      patp = pat_begin;
    
        /* compile */
    
        while ((pat_end == 0 && *patp != 0) || (pat_end != 0 && patp != pat_end)) {
    
            /* need more space */
            if (pp == program_last_instruction) {
    
                Instruction*  old_program = program;
                Instruction** old_threads = threads;
                int old_program_size      = program_size;
    
                program_size *= 2;
    
                program = (Instruction*) malloc(program_size*sizeof(*program));
                threads = (Instruction**)malloc(program_size*sizeof(*threads)*2);
    
                memcpy(program, old_program, old_program_size*sizeof(*program));
    
                if (old_program != onstack_program) {
                    free(old_program); free(old_threads);
                }
    
                program_last_instruction = program + program_size - 1;
                pp = pp - old_program + program;
            }
    
            /* parse pattern */
            switch (*patp) {
                case '*': 
                    pp->i = Star; 
                    /* Optimize multiple stars away */
                    while ((pat_end == 0 || patp+1 != pat_end) && *(patp+1) == '*')  
                        patp++; 
                    break;
    
                case '?':
                    pp->i = AnyChar; 
                    break;
    
                case '\\': 
                    pp->i = Char; 
                    pp->c = *(++patp); /* assumes string does not end in \ */
                    break;
    
                default: 
                    pp->i = Char; 
                    pp->c = *patp; 
                    break;
            }
    
            pp->threads = 0;
    
            pp++;
            patp++;
        }
    
        /* add the End instruction at the end */
        program_last_instruction = pp;
        pp->i                    = End;
        pp->threads              = 0;
    
        /* run */
        Text sp = text_begin; /* input string pointer */
        int n = 1, c = 0; /* next and current index */
        int threadcount[2];
    
        /* initialize */
        threadcount[c] = 1;
        threads[0] = program;
        threads[0]->threads++;
    
        /* run over text */
        while ((text_end == 0 && *sp != 0) || (text_end != 0 && sp != text_end)) {
            /* unless recreated, all threads will die */
            threadcount[n] = 0;
    
            /* run over threads */
            for (int i = 0; i < threadcount[c]; i++) {
    
                Instruction* inst = threads[2*i+c];
                switch (inst->i) {
                    case End: 
                        /* we may not reach end early */ 
                        /* kill this thread without recrating it */
                        inst->threads--; 
                        continue; /* with for loop */
    
                    case Char: 
                        if (*sp != inst->c) {
                            /* if the character is not matched, kill this thread */
                            inst->threads--;
                            continue;
                        }
                        break;
    
                    case Star: 
                        /* spawn off additional thread at current location */
    
                        if (inst->threads == 1) { 
                            /* only if there's noone active there yet */
                            threads[2*(threadcount[n]++)+n] = inst;
                            inst->threads++;
                        }
                        break;
                }
                /* 
                 * common actions: increase program counter for current thread,
                 * decrese amount of threads in last (current) instruction.
                 */
                inst->threads--;
                inst++;
                inst->threads++;
    
                /* respawn at new location in next iteration */
                threads[2*(threadcount[n]++)+n] = inst;
    
                if (inst->i == Star && (inst+1)->threads == 0) { 
                    /* 
                     * already follow no-match option to give us 
                     * more stuff to do 
                     */
                    threads[2*(threadcount[n]++)+n] = inst+1;
                    (inst+1)->threads++;
                }
    
    #ifdef wildcard_fast_DEBUG
                  for (int i = 0 ; i < threadcount[n]; i++) {
                    printf("thread %d at %d.\n", i, threads[2*i+n]-program);
                }
    #endif
    
            }
    
    #ifdef wildcard_fast_DEBUG
            const char *ns[] = {
                "end",
                "star",
                "char",
                "anychar",
            };
            for (Instruction* p = program; p->i; p++) {
                printf("%d. %s %c (%d threads)\n", p-program, ns[p->i], p->i == Char ? p->c : ' ', p->threads);
            }
    #endif
    
            /* swap next and current and advance */
            n = c;
            c = !c;
            sp++;
        }
    
        /* 
         * if there is no thread active in the End instruction when the 
         * end of the input was reached, this was no match
         */
        if (program_last_instruction->threads == 0) sp = 0; 
    
        if (program != onstack_program) {
            /* only need to free if we used malloc */
            free(program); free(threads);
        }
    
        return sp;
    }
    
    char const* wildcard_fast(
            char const *const pattern, 
            char const *const text) 
    {
        return wildcard_fast(
                pattern,
                (const char*)0,
                text,
                (const char*)0);
    }
    
    wchar_t const* wildcard_fast(
            wchar_t const *const pattern,
            wchar_t const *const text) 
    {
        return wildcard_fast(
                pattern,
                (const wchar_t*)0,
                text,
                (const wchar_t*)0);
    }
    
    
    /* tests */
    #ifndef INCLUDE_IMPLEMENTATION
    /* 
     * I just include this file in my projects and define this.
     * That works well for simple algorithms like this
     */
    
    #include <stdio.h>
    #include <time.h>
    
    int main() {
        struct {
            char* p;
            char* t;
            bool expected_result;
        } test[] = {
            {
                "",
                "",
                true
            },
            {
                "a",
                "",
                false
            },
            {
                "",
                "a",
                false
            },
            {
                "a",
                "a",
                true
            },        
            {
                "****.txt", 
                "hello.txt",
                true
            },
            {
                "*.txt", 
                "hello.tzt",
                false
            },
            {
                "*.t?t*", 
                "hello.tzt",
                true
            },
            {
                "*.*", 
                "hi.there",
                true
            },
            /* the wildcard implementation will fail this as it doesn't understand escaping */
            {
                "\\*who\\?\\?.*\\\\", 
                "*who??.there\\",
                true
            },        
            /* these take extraordinaryly long on the O(2^n) implementation */
            {
                "**a*************************??????***a************************?????????a???????????????", 
                "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
                true
            },
            /* runs a bit better if we omit the extra *'s. The fast implementation already optimizes these away. */
            {
                "*a*??????*a*?????????a???????????????", 
                "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
                true
            },
            {0,0} /* marks end of tests */
        };
    
        int t0 = clock();
        const char* result;
        const char* r[] = {"no match", "match"};
    
        for (int i = 0; test[i].p; i++) {
            printf("=== Test %d: Matching '%s' against '%s'...===\n", i, test[i].t, test[i].p);
    
            /* wildcard_fast */
            t0 = clock();
            result = wildcard_fast(test[i].p, test[i].t);
    
            printf("  wildcard_fast (took %d ms):\n", clock()-t0);
            if (!!result != test[i].expected_result)
                printf("    Test failed (reported %s instead of expected %s).\n",
                        r[!!result], r[test[i].expected_result]);
            else if (result) {
                printf("    %s\n", test[i].t);
                printf("    %*.c\n", result-test[i].t+1, '^');
            }
            else
                printf("    No match.\n");
    
            /* wildcard */
            t0 = clock();
            result = (const char*)
                wildcard(test[i].p, test[i].t);
    
            printf("  wildcard (took %d ms):\n", clock()-t0);
            if (!!result != test[i].expected_result)
                printf("    Test failed (reported %s instead of expected %s).\n",
                        r[!!result], r[test[i].expected_result]);
            else if (result) printf("    Match.\n");
            else printf("    No match.\n");
    
            printf("\n");
        }
    }
    #endif
    
    /*
     * Permission is hereby granted, free of charge, to any person
     * obtaining a copy of this software and associated
     * documentation files (the "Software"), to deal in the
     * Software without restriction, including without limitation
     * the rights to use, copy, modify, merge, publish, distribute,
     * sublicense, and/or sell copies of the Software, and to
     * permit persons to whom the Software is furnished to do so,
     * subject to the following conditions:
     * 
     * The above copyright notice and this permission notice shall
     * be included in all copies or substantial portions of the
     * Software.
     * 
     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
     * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
     * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
     * PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS
     * OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     */
    

    This uses a virtual machine approach. See http://swtch.com/~rsc/regexp/ for more on efficitent regular expression parsing.

    0 讨论(0)
  • 2021-01-01 16:40

    PathMatchSpec. Though it suffers from the MAX_PATH limitation (i.e. can accept no more than 260 characters). You may be better off implementing your own matcher; it's not a lot of code.

    0 讨论(0)
  • 2021-01-01 16:49

    Have a look at the POSIX functions fnmatch, glob, and wordexp.

    0 讨论(0)
  • 2021-01-01 16:50

    There are quite a few such functions around. Here's a directory of various implementations, sorted into recursive and non-recursive, etc.

    In case you don't like the licensing there (or have trouble with the link, etc.) here's one possible implementation of a matching algorithm that at least closely approximates what Windows uses:

    #include <string.h>
    #include <iostream>
    
    bool match(char const *needle, char const *haystack) {
        for (; *needle != '\0'; ++needle) {
            switch (*needle) {
            case '?': 
                if (*haystack == '\0')
                    return false;
                ++haystack;
                break;
            case '*': {
                if (needle[1] == '\0')
                    return true;
                size_t max = strlen(haystack);
                for (size_t i = 0; i < max; i++)
                    if (match(needle + 1, haystack + i))
                        return true;
                return false;
            }
            default:
                if (*haystack != *needle)
                    return false;
                ++haystack;
            }
        }
        return *haystack == '\0';
    }
    
    #ifdef TEST
    #define CATCH_CONFIG_MAIN
    
    #include "catch.hpp"
    
    TEST_CASE("Matching", "[match]") {
        REQUIRE(match("a", "a") == true);
        REQUIRE(match("a", "b") == false);
        REQUIRE(match("a*", "a") == true);
        REQUIRE(match("a?", "a") == false);
        REQUIRE(match("a?", "ab") == true);
        REQUIRE(match("a*b", "ab") == true);
        REQUIRE(match("a*b", "acb") == true);
        REQUIRE(match("a*b", "abc") == false);
        REQUIRE(match("*a*??????a?????????a???????????????", 
            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") == true);
    }
    
    #endif
    

    Since there was a discussion of complexity of some of the other answers, I'll note that I believe this has O(NM) complexity and O(M) storage use (where N is the size of the target string, and M is the size of the pattern).

    With @masterxilo's test pair:

    "*a*??????*a*?????????a???????????????", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
    

    ...this finds a match in approximately 3 microseconds on my machine. That is a lot slower than a typical pattern--most of my other tests run in about 300 nanoseconds or so on this particular machine.

    At the same time, @masterxilo's code takes approximately 11 microseconds to run on the same machine, so this is still around 3 to 4 times faster (not to mention being somewhat smaller and simpler).

    0 讨论(0)
  • 2021-01-01 16:52

    Here is a dependency free portable C++ version:

    #include <string>
    
    #include <string.h>
    
    bool wild_match(const std::string& str, const std::string& pat) {
      std::string::const_iterator str_it = str.begin();
      for (std::string::const_iterator pat_it = pat.begin(); pat_it != pat.end();
           ++pat_it) {
        switch (*pat_it) {
          case '?':
            if (str_it == str.end()) {
              return false;
            }
    
            ++str_it;
            break;
          case '*': {
            if (pat_it + 1 == pat.end()) {
              return true;
            }
    
            const size_t max = strlen(&*str_it);
            for (size_t i = 0; i < max; ++i) {
              if (wild_match(&*(pat_it + 1), &*(str_it + i))) {
                return true;
              }
            }
    
            return false;
          }
          default:
            if (*str_it != *pat_it) {
              return false;
            }
    
            ++str_it;
        }
      }
    
      return str_it == str.end();
    }
    
    0 讨论(0)
  • 2021-01-01 16:54

    For wildcard name matching using '*' and '?' try this (if you want to avoid boost, use std::tr1::regex):

    #include <boost/regex.hpp>
    #include <boost/algorithm/string/replace.hpp>
    
    using std::string;
    
    bool MatchTextWithWildcards(const string &text, string wildcardPattern, bool caseSensitive /*= true*/)
    {
        // Escape all regex special chars
        EscapeRegex(wildcardPattern);
    
        // Convert chars '*?' back to their regex equivalents
        boost::replace_all(wildcardPattern, "\\?", ".");
        boost::replace_all(wildcardPattern, "\\*", ".*");
    
        boost::regex pattern(wildcardPattern, caseSensitive ? regex::normal : regex::icase);
    
        return regex_match(text, pattern);
    }
    
    void EscapeRegex(string &regex)
    {
        boost::replace_all(regex, "\\", "\\\\");
        boost::replace_all(regex, "^", "\\^");
        boost::replace_all(regex, ".", "\\.");
        boost::replace_all(regex, "$", "\\$");
        boost::replace_all(regex, "|", "\\|");
        boost::replace_all(regex, "(", "\\(");
        boost::replace_all(regex, ")", "\\)");
        boost::replace_all(regex, "{", "\\{");
        boost::replace_all(regex, "{", "\\}");
        boost::replace_all(regex, "[", "\\[");
        boost::replace_all(regex, "]", "\\]");
        boost::replace_all(regex, "*", "\\*");
        boost::replace_all(regex, "+", "\\+");
        boost::replace_all(regex, "?", "\\?");
        boost::replace_all(regex, "/", "\\/");
    }
    
    0 讨论(0)
提交回复
热议问题