Tuesday, April 21, 2015

LeetCode OJ - Median of Two Sorted Arrays

Problem:

Solution:

The requirement on logarithmic run time hint on binary search. Indeed it is. Let's first generalize the problem to finding the kth element first, we are given two sorted array, and need to find the kth element in the merged array.

It is easier to also assume the elements are distinct, to do so, we imagine the two arrays are concatenated and therefore we can associate a unique index $i$ to each element, and we also imagine each element is modified as $A[i] = A[i] + \epsilon i$ with sufficiently small $\epsilon$, that allow us to think of each element as distinct.

Without a better choice, let's consider splitting the arrays by half. The left array becomes $[A, p, B]$ where $A$ is a set of number smaller than $p$, and $B$ is a set of element greater than $p$. Similarly, the right hand side got split into $[C, q, D]$.

Because every element in the array is now unique, only two possibilities can happen, either $p > q$ or $p < q$. For brevity let's look at only the case $p < q$, the other case is similar.

We know $A$ is smaller than $p$, and $p$ is smaller than $q$, so we know $A$ is smaller than $q$. We also know $C$ is smaller than $q$.

On the other hand, we know $D$ is greater than $q$, and know $q$ is greater than $p$, so we know $D$ is greater than $p$. We also know $D$ is greater than $p$.

As a summary, $A, p, C$ are all less than $q$. $B, q, D$ are all greater than $p$.

Now it is the key, either $A, p, C$ has at least $k$ element, or $B, q, D$ has at least $n - k$ element, so we reduced the problem, the rest is trivial.

Code:

#include "stdafx.h"

// https://leetcode.com/problems/median-of-two-sorted-arrays/

#include "LEET_MEDIAN_OF_TWO_SORTED_ARRAYS.h"
#include <map>
#include <iostream>
#include <sstream>
#include <vector>
#include <string>

using namespace std;

namespace _LEET_MEDIAN_OF_TWO_SORTED_ARRAYS
{
class Solution
{
public:
double findMedianSortedArrays(vector<int>& nums1, vector<int>& nums2)
{
int size1 = nums1.size();
int size2 = nums2.size();
int size = size1 + size2;
int mid = size / 2;
if (size % 2 == 0)
{
// a[10] => (a[4] + a[5]) * 0.5
return (select(nums1, nums2, 0, size1, 0, size2, mid - 1) + select(nums1, nums2, 0, size1, 0, size2, mid)) * 0.5;
}
else
{
// a[9] => a[4]
return select(nums1, nums2, 0, size1, 0, size2, mid);
}
}
private:
int select(vector<int>& nums1, vector<int>& nums2, int s1, int e1, int s2, int e2, int k)
{
int size1 = e1 - s1;
int size2 = e2 - s2;
int size = size1 + size2;
if (size1 == 0)
{
return nums2[s2 + k];
}
else if (size2 == 0)
{
return nums1[s1 + k];
}
else
{
// First, all elements n[i] = n[i] + epsilon * i with sufficiently small epsilon, which means all elements are unique (i measured from the two vectors concatenated)
int probe1 = size1 / 2; // To be determined
int probe2 = size2 / 2; // To be determined
int probe1_value = nums1[s1 + probe1];
int probe2_value = nums2[s2 + probe2];

// Denote all the numbers smaller than probe1 in nums1 be the set A with size a
// Denote all the numbers greater than probe1 in nums1 be the set B with size b
// Denote all the numbers smaller than probe2 in nums2 be the set C with size c
// Denote all the numbers greater than probe2 in nums2 be the set D with size d

int a = probe1;
int b = size1 - probe1 - 1;
int c = probe2;
int d = size2 - probe2 - 1;

// Just an algebraic identity, we have
// a + b + c + d = size1 + size2 - 2
//               = size - 2
//     a + c + 1 = size - b - d - 1

if (probe1_value <= probe2_value) // Note the assumption above, it is the same as saying probe1_value epsilon modified is less than as probe2_value
{
if (a + c + 1 >= k + 1)
{
// The number of elements smaller than probe2_value is at least a + c + 1
// so we can safely discard any values known to be larger than or equal to probe2_value
return select(nums1, nums2, s1, e1, s2, s2 + probe2, k);
}
else if (a + c + 1 <= k)
{
// The number of elements greater than probe1_value is at least b + d + 1
// The number of elements smaller than probe1_value is at most size - b - d - 1 = a + c + 1
// so we can safely discard any values known to be smaller than probe1_value
return select(nums1, nums2, s1 + probe1 + 1, e1, s2, e2, k - probe1 - 1);
}
}
else if (probe1_value > probe2_value)
{
if (a + c + 1 >= k + 1)
{
// The number of elements smaller than probe1_value is at least a + c + 1
// so we can safely discard any values known to be larger than or equal to probe1_value
return select(nums1, nums2, s1, s1 + probe1, s2, e2, k);
}
else if (a + c + 1 <= k)
{
// The number of elements greater than probe2_value is at least b + d + 1
// The number of elements smaller than probe2_value is at most size - b - d - 1 = a + c + 1
// so we can safely discard any values known to be smaller than probe2_value
return select(nums1, nums2, s1, e1, s2 + probe2 + 1, e2, k - probe2 - 1);
}
}

return 0;
}
}
};
};

using namespace _LEET_MEDIAN_OF_TWO_SORTED_ARRAYS;

int LEET_MEDIAN_OF_TWO_SORTED_ARRAYS()
{
Solution solution;
// int nums1[] = { };
int nums2[] = { 2, 3 };
vector<int> nums1v;
// vector<int> nums1v(nums1, nums1 + sizeof(nums1) / sizeof(nums1[0]));
vector<int> nums2v(nums2, nums2 + sizeof(nums2) / sizeof(nums2[0]));
cout << solution.findMedianSortedArrays(nums1v, nums2v) << endl;
return 0;
}