Case Insensitive String comp in C

耗尽温柔 提交于 2019-12-17 06:31:52

问题


I have two postcodes char* that I want to compare, ignoring case. Is there a function to do this?

Or do I have to loop through each use the tolower function and then do the comparison?

Any idea how this function will react with numbers in the string

Thanks


回答1:


There is no function that does this in the C standard. Unix systems that comply with POSIX are required to have strcasecmp in the header strings.h; Microsoft systems have stricmp. To be on the portable side, write your own:

int strcicmp(char const *a, char const *b)
{
    for (;; a++, b++) {
        int d = tolower((unsigned char)*a) - tolower((unsigned char)*b);
        if (d != 0 || !*a)
            return d;
    }
}

But note that none of these solutions will work with UTF-8 strings, only ASCII ones.




回答2:


Take a look to strcasecmp() in strings.h.




回答3:


I would use stricmp(). It compares two strings without regard to case.

Note that, in some cases, converting the string to lower case can be faster.




回答4:


I've found built-in such method named from which contains additional string functions to the standard header .

Here's the relevant signatures :

int  strcasecmp(const char *, const char *);
int  strncasecmp(const char *, const char *, size_t);

I also found it's synonym in xnu kernel (osfmk/device/subrs.c) and it's implemented in the following code, so you wouldn't expect to have any change of behavior in number compared to the original strcmp function.

tolower(unsigned char ch) {
    if (ch >= 'A' && ch <= 'Z')
        ch = 'a' + (ch - 'A');
    return ch;
 }

int strcasecmp(const char *s1, const char *s2) {
    const unsigned char *us1 = (const u_char *)s1,
                        *us2 = (const u_char *)s2;

    while (tolower(*us1) == tolower(*us2++))
        if (*us1++ == '\0')
            return (0);
    return (tolower(*us1) - tolower(*--us2));
}



回答5:


I'm not really a fan of the most-upvoted answer here (in part because it isn't correct since it should continue if it reads a null terminator in either string--but not both strings at once--and it doesn't do this), so I wrote my own.

This is a direct drop-in replacement for strncmp(), and has been fully tested with numerous test cases, as shown below:

The code only:

#include <ctype.h> // for `tolower()`
#include <limits.h> // for `INT_MIN`

// Case-insensitive `strncmp()`
static inline int strncmpci(const char * str1, const char * str2, size_t num)
{
    int ret_code = INT_MIN;
    size_t chars_compared = 0;

    if (!str1 || !str2)
    {
        goto done;
    }

    while ((*str1 || *str2) && (chars_compared < num))
    {
        ret_code = tolower((int)(*str1)) - tolower((int)(*str2));
        if (ret_code != 0)
        {
            break;
        }
        chars_compared++;
        str1++;
        str2++;
    }

done:
    return ret_code;
}

Fully-commented version:

#include <ctype.h> // for `tolower()`
#include <limits.h> // for `INT_MIN`

/*

Case-insensitive string compare (strncmp case-insensitive)
- Identical to strncmp except case-insensitive. See: http://www.cplusplus.com/reference/cstring/strncmp/
- Aided/inspired, in part, by: https://stackoverflow.com/a/5820991/4561887

str1    C string 1 to be compared
str2    C string 2 to be compared
num     max number of chars to compare

return:
(essentially identical to strncmp)
INT_MIN  invalid arguments (one or both of the input strings is a NULL pointer)
<0       the first character that does not match has a lower value in str1 than in str2
 0       the contents of both strings are equal
>0       the first character that does not match has a greater value in str1 than in str2

*/
static inline int strncmpci(const char * str1, const char * str2, size_t num)
{
    int ret_code = INT_MIN;

    size_t chars_compared = 0;

    // Check for NULL pointers
    if (!str1 || !str2)
    {
        goto done;
    }

    // Continue doing case-insensitive comparisons, one-character-at-a-time, of str1 to str2, 
    // as long as at least one of the strings still has more characters in it, and we have
    // not yet compared num chars.
    while ((*str1 || *str2) && (chars_compared < num))
    {
        ret_code = tolower((int)(*str1)) - tolower((int)(*str2));
        if (ret_code != 0)
        {
            // The 2 chars just compared don't match
            break;
        }
        chars_compared++;
        str1++;
        str2++;
    }

done:
    return ret_code;
}

Test code: (run it online here): https://onlinegdb.com/B1Qoj0W_N

int main()
{
    printf("Hello World\n\n");

    const char * str1;
    const char * str2;
    size_t n;

    str1 = "hey";
    str2 = "HEY";
    n = 3;
    printf("strncmpci(%s, %s, %u) = %i\n", str1, str2, n, strncmpci(str1, str2, n));
    printf("strncmp(%s, %s, %u) = %i\n", str1, str2, n, strncmp(str1, str2, n));
    printf("\n");

    str1 = "heY";
    str2 = "HeY";
    n = 3;
    printf("strncmpci(%s, %s, %u) = %i\n", str1, str2, n, strncmpci(str1, str2, n));
    printf("strncmp(%s, %s, %u) = %i\n", str1, str2, n, strncmp(str1, str2, n));
    printf("\n");

    str1 = "hey";
    str2 = "HEdY";
    n = 3;
    printf("strncmpci(%s, %s, %u) = %i\n", str1, str2, n, strncmpci(str1, str2, n));
    printf("strncmp(%s, %s, %u) = %i\n", str1, str2, n, strncmp(str1, str2, n));
    printf("\n");

    str1 = "heY";
    str2 = "HeYd";
    n = 3;
    printf("strncmpci(%s, %s, %u) = %i\n", str1, str2, n, strncmpci(str1, str2, n));
    printf("strncmp(%s, %s, %u) = %i\n", str1, str2, n, strncmp(str1, str2, n));   
    printf("\n");

    str1 = "heY";
    str2 = "HeYd";
    n = 6;
    printf("strncmpci(%s, %s, %u) = %i\n", str1, str2, n, strncmpci(str1, str2, n));
    printf("strncmp(%s, %s, %u) = %i\n", str1, str2, n, strncmp(str1, str2, n));
    printf("\n");

    str1 = "hey";
    str2 = "hey";
    n = 6;
    printf("strncmpci(%s, %s, %u) = %i\n", str1, str2, n, strncmpci(str1, str2, n));
    printf("strncmp(%s, %s, %u) = %i\n", str1, str2, n, strncmp(str1, str2, n));
    printf("\n");

    str1 = "hey";
    str2 = "heyd";
    n = 6;
    printf("strncmpci(%s, %s, %u) = %i\n", str1, str2, n, strncmpci(str1, str2, n));
    printf("strncmp(%s, %s, %u) = %i\n", str1, str2, n, strncmp(str1, str2, n));
    printf("\n");

    str1 = "hey";
    str2 = "heyd";
    n = 3;
    printf("strncmpci(%s, %s, %u) = %i\n", str1, str2, n, strncmpci(str1, str2, n));
    printf("strncmp(%s, %s, %u) = %i\n", str1, str2, n, strncmp(str1, str2, n));
    printf("\n");

    return 0;
}

Sample output:

Hello World

strncmpci(hey, HEY, 3) = 0
strncmp(hey, HEY, 3) = 32

strncmpci(heY, HeY, 3) = 0
strncmp(heY, HeY, 3) = 32

strncmpci(hey, HEdY, 3) = 21
strncmp(hey, HEdY, 3) = 32

strncmpci(heY, HeYd, 3) = 0
strncmp(heY, HeYd, 3) = 32

strncmpci(heY, HeYd, 6) = -100
strncmp(heY, HeYd, 6) = 32

strncmpci(hey, hey, 6) = 0
strncmp(hey, hey, 6) = 0

strncmpci(hey, heyd, 6) = -100
strncmp(hey, heyd, 6) = -100

strncmpci(hey, heyd, 3) = 0
strncmp(hey, heyd, 3) = 0

References:

  1. This question & other answers here served as inspiration and gave some insight (Case Insensitive String comp in C)
  2. http://www.cplusplus.com/reference/cstring/strncmp/
  3. https://en.wikipedia.org/wiki/ASCII
  4. https://en.cppreference.com/w/c/language/operator_precedence



回答6:


Additional pitfalls to watch out for when doing case insensitive compares:


Comparing as lower or as upper case? (common enough issue)

Both below will return 0 with strcicmpL("A", "a") and strcicmpU("A", "a").
Yet strcicmpL("A", "_") and strcicmpU("A", "_") can return different signed results as '_' is often between the upper and lower case letters.

This affects the sort order when used with qsort(..., ..., ..., strcicmp). Non-standard library C functions like the commonly available stricmp() or strcasecmp() tend to be well defined and favor comparing via lowercase. Yet variations exist.

int strcicmpL(char const *a, char const *b) {
  while (*a) {
    int d = tolower(*a) - tolower(*b);
    if (d) {
        return d;
    } 
    a++;
    b++;
  } 
  return 0;
}

int strcicmpU(char const *a, char const *b) {
  while (*a) {
    int d = toupper(*a) - toupper(*b);
    if (d) {
        return d;
    } 
    a++;
    b++;
  } 
  return 0;
}

char can have a negative value. (not rare)

touppper(int) and tolower(int) are specified for unsigned char values and the negative EOF. Further, strcmp() returns results as if each char was converted to unsigned char, regardless if char is signed or unsigned.

tolower(*a); // Potential UB
tolower((unsigned char) *a); // Correct

Locale (less common)

Although character sets using ASCII code (0-127) are ubiquitous, the remainder codes tend to have locale specific issues. So strcasecmp("\xE4", "a") might return a 0 on one system and non-zero on another.


Unicode (the way of the future)

If a solution needs to handle more than ASCII consider a unicode_strcicmp(). As C lib does not provide such a function, a pre-coded function from some alternate library is recommended. Writing your own unicode_strcicmp() is a daunting task.


Do all letters map one lower to one upper? (pedantic)

[A-Z] maps one-to-one with [a-z], yet various locales map various lower case chracters to one upper and visa-versa. Further, some uppercase characters may lack a lower case equivalent and again, visa-versa.

This obliges code to covert through both tolower() and tolower().

int d = tolower(toupper(*a)) - tolower(toupper(*b));

Again, potential different results for sorting if code did tolower(toupper(*a)) vs. toupper(tolower(*a)).


Portability

@B. Nadolson recommends to avoid rolling your own strcicmp() and this is reasonable, except when code needs high equivalent portable functionality.

Below is an approach that even performed faster than some system provided functions. It does a single compare per loop rather than two by using 2 different tables that differ with '\0'. Your results may vary.

static unsigned char low1[UCHAR_MAX + 1] = {
  0, 1, 2, 3, ...
  '@', 'a', 'b', 'c', ... 'z', `[`, ...  // @ABC... Z[...
  '`', 'a', 'b', 'c', ... 'z', `{`, ...  // `abc... z{...
}
static unsigned char low2[UCHAR_MAX + 1] = {
// v--- Not zero, but A which matches none in `low1[]`
  'A', 1, 2, 3, ...
  '@', 'a', 'b', 'c', ... 'z', `[`, ...
  '`', 'a', 'b', 'c', ... 'z', `{`, ...
}

int strcicmp_ch(char const *a, char const *b) {
  // compare using tables that differ slightly.
  while (low1[(unsigned char)*a] == low2[(unsigned char)*b]) {
    a++;
    b++;
  }
  // Either strings differ or null character detected.
  // Perform subtraction using same table.
  return (low1[(unsigned char)*a] - low1[(unsigned char)*b]);
}



回答7:


You can get an idea, how to implement an efficient one, if you don't have any in the library, from here

It use a table for all 256 chars.

  • in that table for all chars, except letters - used its ascii codes.
  • for upper case letter codes - the table list codes of lower cased symbols.

then we just need to traverse a strings and compare our table cells for a given chars:

const char *cm = charmap,
        *us1 = (const char *)s1,
        *us2 = (const char *)s2;
while (cm[*us1] == cm[*us2++])
    if (*us1++ == '\0')
        return (0);
return (cm[*us1] - cm[*--us2]);



回答8:


static int ignoreCaseComp (const char *str1, const char *str2, int length)
{
    int k;
    for (k = 0; k < length; k++)
    {

        if ((str1[k] | 32) != (str2[k] | 32))
            break;
    }

    if (k != length)
        return 1;
    return 0;
}

Reference




回答9:


As others have stated, there is no portable function that works on all systems. You can partially circumvent this with simple ifdef:

#include <stdio.h>

#ifdef _WIN32
#include <string.h>
#define strcasecmp _stricmp
#else // assuming POSIX or BSD compliant system
#include <strings.h>
#endif

int main() {
    printf("%d", strcasecmp("teSt", "TEst"));
}



回答10:


Simple solution:

int str_case_ins_cmp(const char* a, const char* b) {
  int rc;

  while (1) {
    rc = tolower((unsigned char)*a) - tolower((unsigned char)*b);
    if (rc || !*a) {
      break;
    }

    ++a;
    ++b;
  }

  return rc;
}



回答11:


int strcmpInsensitive(char* a, char* b)
{
    return strcmp(lowerCaseWord(a), lowerCaseWord(b));
}

char* lowerCaseWord(char* a)
{
    char *b=new char[strlen(a)];
    for (int i = 0; i < strlen(a); i++)
    {
        b[i] = tolower(a[i]);   
    }
    return b;
}

good luck

Edit-lowerCaseWord function get a char* variable with, and return the lower case value of this char*. For example "AbCdE" for value of char*, will return "abcde".

Basically what it does is to take the two char* variables, after being transferred to lower case, and make use the strcmp function on them.

For example- if we call the strcmpInsensitive function for values of "AbCdE", and "ABCDE", it will first return both values in lower case ("abcde"), and then do strcmp function on them.



来源:https://stackoverflow.com/questions/5820810/case-insensitive-string-comp-in-c

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!