I\'m trying to understand how do I modify the binary search for it work for first and last occurrences, surely I can find some code on the web but I\'m trying to reach deep
Here are 4 variants of binary search, plus test code. In abstract terms, these search an ordered array, X[1..N], which may contain repeated data values, for a particular value T.
BinSearch_A()
— find an arbitrary index P for which X[P] = T.BinSearch_B()
— find the minimum index P for which X[P] = T.BinSearch_C()
— find the maximum index P for which X[P] = Y.BinSearch_D()
— find the range of indexes P..Q for which X[P] = T (but X[P-1] != T) and for which X[Q] = T (but X[Q+1] != T).BinSearch_D()
is what is wanted in the question, but understanding the other three will help with understanding the final result.
The code is based on material from chapter 8 of Programming Pearls, 1st Edn. It shows BinSearch_A()
— which it develops in an earlier chapter — and then modifies it to BinSearch_B()
as the first step in the optimization of the search for a value in an array of exactly 1000 entries.
The code presented contains the pseudo-code for 1-based arrays, actual C code for 0-based arrays, and a test harness which validates the data and results extensively. It is 330 lines, of which 30 are blank, about nearly 100 are comments, 12 are headers, types and declarations, 90 are the 4 implementations, and 100 are test code.
Note that if you know that the values in the array are unique, then BinSearch_D()
is not the correct algorithm to use (use BinSearch_A()
). Similarly, if it doesn't matter which element of a set of identical elements is selected, BinSearch_D()
is not the correct algorithm to use (use BinSearch_A()
). The two variants BinSearch_B()
and BinSearch_C()
are specialized, but do less work than BinSearch_D()
if you only need the minimum index or maximum index at which the searched-for value occurs.
#include
#include
#include
typedef struct Pair
{
int lo;
int hi;
} Pair;
extern Pair BinSearch_D(int N, const int X[N], int T);
extern int BinSearch_A(int N, const int X[N], int T);
extern int BinSearch_B(int N, const int X[N], int T);
extern int BinSearch_C(int N, const int X[N], int T);
/*
** Programming Pearls, 1st Edn, 8.3 Major Surgery - Binary Search
**
** In each fragment, P contains the result (0 indicates 'not found').
**
** Search for T in X[1..N] - BinSearch-A
**
** L := 1; U := N
** loop
** # Invariant: if T is in X, it is in X[L..U]
** if L > U then
** P := 0; break;
** M := (L+U) div 2
** case
** X[M] < T: L := M+1
** X[M] = T: P := M; break
** X[M] > T: U := M-1
**
** Search for first occurrence of T in X[1..N] - BinSearch-B
**
** L := 0; U := N+1
** while L+1 != U do
** # Invariant: X[L] < T and X[U] >= T and L < U
** M := (L+U) div 2
** if X[M] < T then
** L := M
** else
** U := M
** # Assert: L+1 = U and X[L] < T and X[U] >= T
** P := U
** if P > N or X[P] != T then P := 0
**
** Search for last occurrence of T in X[1..N] - BinSearch-C
** Adapted from BinSearch-B (not in Programming Pearls)
**
** L := 0; U := N+1
** while L+1 != U do
** # Invariant: X[L] <= T and X[U] > T and L < U
** M := (L+U) div 2
** if X[M] <= T then
** L := M
** else
** U := M
** # Assert: L+1 = U and X[L] < T and X[U] >= T
** P := L
** if P = 0 or X[P] != T then P := 0
**
** C implementations of these deal with X[0..N-1] instead of X[1..N],
** and return -1 when the value is not found.
*/
int BinSearch_A(int N, const int X[N], int T)
{
int L = 0;
int U = N-1;
while (1)
{
/* Invariant: if T is in X, it is in X[L..U] */
if (L > U)
return -1;
int M = (L + U) / 2;
if (X[M] < T)
L = M + 1;
else if (X[M] > T)
U = M - 1;
else
return M;
}
assert(0);
}
int BinSearch_B(int N, const int X[N], int T)
{
int L = -1;
int U = N;
while (L + 1 != U)
{
/* Invariant: X[L] < T and X[U] >= T and L < U */
int M = (L + U) / 2;
if (X[M] < T)
L = M;
else
U = M;
}
assert(L+1 == U && (L == -1 || X[L] < T) && (U >= N || X[U] >= T));
int P = U;
if (P >= N || X[P] != T)
P = -1;
return P;
}
int BinSearch_C(int N, const int X[N], int T)
{
int L = -1;
int U = N;
while (L + 1 != U)
{
/* Invariant: X[L] <= T and X[U] > T and L < U */
int M = (L + U) / 2;
if (X[M] <= T)
L = M;
else
U = M;
}
assert(L+1 == U && (L == -1 || X[L] <= T) && (U >= N || X[U] > T));
int P = L;
if (P < 0 || X[P] != T)
P = -1;
return P;
}
/*
** If value is found in the array (elements array[0]..array[a_size-1]),
** then the returned Pair identifies the lowest index at which value is
** found (in lo) and the highest value (in hi). The lo and hi values
** will be the same if there's only one entry that matches.
**
** If the value is not found in the array, the pair (-1, -1) is returned.
**
** -- If the values in the array are unique, then this is not the binary
** search to use.
** -- If it doesn't matter which one of multiple identical keys are
** returned, this is not the binary search to use.
**
** ------------------------------------------------------------------------
**
** Another way to look at this is:
** -- Lower bound is largest index lo such that a[lo] < value and a[lo+1] >= value
** -- Upper bound is smallest index hi such that a[hi] > value and a[hi-1] <= value
** -- Range is then lo+1..hi-1.
** -- If the values is not found, the value (-1, -1) is returned.
**
** Let's review:
** == Data: 2, 3, 3, 3, 5, 5, 6, 8 (N = 8)
** Searches and results:
** == 1 .. lo = -1, hi = -1 R = (-1, -1) not found
** == 2 .. lo = -1, hi = 1 R = ( 0, 0) found
** == 3 .. lo = 0, hi = 4 R = ( 1, 3) found
** == 4 .. lo = -1, hi = -1 R = (-1, -1) not found
** == 5 .. lo = 3, hi = 6 R = ( 4, 5) found
** == 6 .. lo = 5, hi = 7 R = ( 6, 6) found
** == 7 .. lo = -1, hi = -1 R = (-1, -1) not found
** == 8 .. lo = 6, hi = 8 R = ( 7, 7) found
** == 9 .. lo = -1, hi = -1 R = (-1, -1) not found
**
** Code created by combining BinSearch_B() and BinSearch_C() into a
** single function. The two separate ranges of values to be searched
** (L_lo:L_hi vs U_lo:U_hi) are almost unavoidable as if there are
** repeats in the data, the values diverge.
*/
Pair BinSearch_D(int N, const int X[N], int T)
{
int L_lo = -1;
int L_hi = N;
int U_lo = -1;
int U_hi = N;
while (L_lo + 1 != L_hi || U_lo + 1 != U_hi)
{
if (L_lo + 1 != L_hi)
{
/* Invariant: X[L_lo] < T and X[L_hi] >= T and L_lo < L_hi */
int L_md = (L_lo + L_hi) / 2;
if (X[L_md] < T)
L_lo = L_md;
else
L_hi = L_md;
}
if (U_lo + 1 != U_hi)
{
/* Invariant: X[U_lo] <= T and X[U_hi] > T and U_lo < U_hi */
int U_md = (U_lo + U_hi) / 2;
if (X[U_md] <= T)
U_lo = U_md;
else
U_hi = U_md;
}
}
assert(L_lo+1 == L_hi && (L_lo == -1 || X[L_lo] < T) && (L_hi >= N || X[L_hi] >= T));
int L = L_hi;
if (L >= N || X[L] != T)
L = -1;
assert(U_lo+1 == U_hi && (U_lo == -1 || X[U_lo] <= T) && (U_hi >= N || X[U_hi] > T));
int U = U_lo;
if (U < 0 || X[U] != T)
U = -1;
return (Pair) { .lo = L, .hi = U };
}
/* Test support code */
static void check_sorted(const char *a_name, int size, const int array[size])
{
int ok = 1;
for (int i = 1; i < size; i++)
{
if (array[i-1] > array[i])
{
fprintf(stderr, "Out of order: %s[%d] = %d, %s[%d] = %d\n",
a_name, i-1, array[i-1], a_name, i, array[i]);
ok = 0;
}
}
if (!ok)
exit(1);
}
static void dump_array(const char *a_name, int size, const int array[size])
{
printf("%s Data: ", a_name);
for (int i = 0; i < size; i++)
printf(" %2d", array[i]);
putchar('\n');
}
/* This interface could be used instead of the Pair returned by BinSearch_D() */
static void linear_search(int size, const int array[size], int value, int *lo, int *hi)
{
*lo = *hi = -1;
for (int i = 0; i < size; i++)
{
if (array[i] == value)
{
if (*lo == -1)
*lo = i;
*hi = i;
}
else if (array[i] > value)
break;
}
}
static void test_abc_search(const char *a_name, int size, const int array[size])
{
dump_array(a_name, size, array);
check_sorted(a_name, size, array);
for (int i = array[0] - 1; i < array[size - 1] + 2; i++)
{
int a_idx = BinSearch_A(size, array, i);
int b_idx = BinSearch_B(size, array, i);
int c_idx = BinSearch_C(size, array, i);
printf("T %2d: A %2d, B %2d, C %2d\n", i, a_idx, b_idx, c_idx);
assert(a_idx != -1 || (b_idx == -1 && c_idx == -1));
assert(b_idx != -1 || (c_idx == -1 && a_idx == -1));
assert(c_idx != -1 || (a_idx == -1 && b_idx == -1));
assert(a_idx == -1 || (array[a_idx] == i && array[b_idx] == i && array[c_idx] == i));
assert(a_idx == -1 || (a_idx >= b_idx && a_idx <= c_idx));
int lo;
int hi;
linear_search(size, array, i, &lo, &hi);
assert(lo == b_idx && hi == c_idx);
}
}
static void test_d_search(const char *a_name, int size, const int array[size], const Pair results[])
{
dump_array(a_name, size, array);
check_sorted(a_name, size, array);
for (int i = array[0] - 1, j = 0; i < array[size - 1] + 2; i++, j++)
{
Pair result = BinSearch_D(size, array, i);
printf("%2d: (%d, %d) vs (%d, %d)\n", i, result.lo, result.hi, results[j].lo, results[j].hi);
int lo;
int hi;
linear_search(size, array, i, &lo, &hi);
assert(lo == result.lo && hi == result.hi);
}
}
int main(void)
{
int A[] = { 1, 2, 2, 4, 5, 5, 5, 5, 5, 6, 8, 8, 9, 10 };
enum { A_SIZE = sizeof(A) / sizeof(A[0]) };
test_abc_search("A", A_SIZE, A);
int B[] = { 10, 12, 12, 12, 14, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 18, 18, 18, 19, 19, 19, 19, };
enum { B_SIZE = sizeof(B) / sizeof(B[0]) };
test_abc_search("B", B_SIZE, B);
int C[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, };
enum { C_SIZE = sizeof(C) / sizeof(C[0]) };
test_abc_search("C", C_SIZE, C);
/* Results for BinSearch_D() on array A */
static const Pair results_A[] =
{
{ -1, -1 }, { 0, 0 }, { 1, 2 },
{ -1, -1 }, { 3, 3 }, { 4, 8 },
{ 9, 9 }, { -1, -1 }, { 10, 11 },
{ 12, 12 }, { 13, 13 }, { -1, -1 },
};
test_d_search("A", A_SIZE, A, results_A);
int D[] = { 2, 3, 3, 3, 5, 5, 6, 8 };
enum { D_SIZE = sizeof(D) / sizeof(D[0]) };
static const Pair results_D[] =
{
{ -1, -1 }, { 0, 0 }, { 1, 3 },
{ -1, -1 }, { 4, 5 }, { 6, 6 },
{ -1, -1 }, { 7, 7 }, { -1, -1 },
};
test_d_search("D", D_SIZE, D, results_D);
return 0;
}
Example output:
A Data: 1 2 2 4 5 5 5 5 5 6 8 8 9 10
T 0: A -1, B -1, C -1
T 1: A 0, B 0, C 0
T 2: A 2, B 1, C 2
T 3: A -1, B -1, C -1
T 4: A 3, B 3, C 3
T 5: A 6, B 4, C 8
T 6: A 9, B 9, C 9
T 7: A -1, B -1, C -1
T 8: A 10, B 10, C 11
T 9: A 12, B 12, C 12
T 10: A 13, B 13, C 13
T 11: A -1, B -1, C -1
B Data: 10 12 12 12 14 15 15 15 15 15 15 15 16 16 16 18 18 18 19 19 19 19
T 9: A -1, B -1, C -1
T 10: A 0, B 0, C 0
T 11: A -1, B -1, C -1
T 12: A 1, B 1, C 3
T 13: A -1, B -1, C -1
T 14: A 4, B 4, C 4
T 15: A 10, B 5, C 11
T 16: A 13, B 12, C 14
T 17: A -1, B -1, C -1
T 18: A 16, B 15, C 17
T 19: A 19, B 18, C 21
T 20: A -1, B -1, C -1
C Data: 1 2 3 4 5 6 7 8 9 10 11 12
T 0: A -1, B -1, C -1
T 1: A 0, B 0, C 0
T 2: A 1, B 1, C 1
T 3: A 2, B 2, C 2
T 4: A 3, B 3, C 3
T 5: A 4, B 4, C 4
T 6: A 5, B 5, C 5
T 7: A 6, B 6, C 6
T 8: A 7, B 7, C 7
T 9: A 8, B 8, C 8
T 10: A 9, B 9, C 9
T 11: A 10, B 10, C 10
T 12: A 11, B 11, C 11
T 13: A -1, B -1, C -1
A Data: 1 2 2 4 5 5 5 5 5 6 8 8 9 10
0: (-1, -1) vs (-1, -1)
1: (0, 0) vs (0, 0)
2: (1, 2) vs (1, 2)
3: (-1, -1) vs (-1, -1)
4: (3, 3) vs (3, 3)
5: (4, 8) vs (4, 8)
6: (9, 9) vs (9, 9)
7: (-1, -1) vs (-1, -1)
8: (10, 11) vs (10, 11)
9: (12, 12) vs (12, 12)
10: (13, 13) vs (13, 13)
11: (-1, -1) vs (-1, -1)
D Data: 2 3 3 3 5 5 6 8
1: (-1, -1) vs (-1, -1)
2: (0, 0) vs (0, 0)
3: (1, 3) vs (1, 3)
4: (-1, -1) vs (-1, -1)
5: (4, 5) vs (4, 5)
6: (6, 6) vs (6, 6)
7: (-1, -1) vs (-1, -1)
8: (7, 7) vs (7, 7)
9: (-1, -1) vs (-1, -1)