Difficulty: Medium, Asked-in: Facebook, Amazon, Microsoft
Key takeaway: An excellent problem to learn problem-solving using hashing and sliding window approach.
You are given an array X[] of size n and an integer k. Write a program to count the distinct elements present in every window of size k of the array.
Input: X[ ] = [1, 1, 1, 3, 4, 2, 3], k = 4, Output: [2, 3, 4, 3]
Explanation: The window [1, 1, 1, 3] has 2 distinct elements, [1, 1, 3, 4] has 3, [1, 3, 4, 2] has 4, and [3, 4, 2, 3] has 3.
Input: X[ ] = [1, 2, 4, 4, 2], k = 3, Output: [3, 2, 2]
Explanation: The window [1, 2, 4] has 3 distinct elements, [2, 4, 4] has 2, and [4, 4, 2] has 2.
The basic idea would be to consider each window of size k while traversing the array and count the distinct elements in each window.
int* getDistinctCount(int X[], int n, int k)
{
    int* output = new int[n - k + 1];
    for (int i = 0; i <= n - k; i = i + 1)
    {
        int distCount = 0;
        // For each element X[j] of the current window, count it only if it
        // does not appear earlier in the window prefix [i, j - 1].
        for (int j = i; j < i + k; j = j + 1)
        {
            distCount = distCount + 1;
            for (int p = i; p < j; p = p + 1)
            {
                if (X[j] == X[p])
                {
                    distCount = distCount - 1;
                    break;
                }
            }
        }
        output[i] = distCount;
    }
    return output;
}
Here we are using three nested loops: the first loop runs n - k + 1 times, the second loop runs k times, and the third loop runs j - i times, which is at most k times. So the total number of loop iterations in the worst case = (n - k + 1) * k * k = O(n*k^2). At each step of the iteration, we are doing O(1) operations, so time complexity = O(n*k^2) * O(1) = O(n*k^2).
Space complexity = O(1), as we are using constant extra space apart from the output array. Think!
Now the critical question is: can we improve the time complexity of the above approach further? The critical operation in the inner loops is searching for a duplicate of element X[j] in the prefix [i, j - 1] of the current window. So, instead of running two nested loops to find the distinct elements, can we apply the idea of binary search by sorting the current window? Let's think!
The idea would be simple: for each window, we sort a copy W[] of the window so that the original order of the array is preserved for later windows. Then, inside the inner loop, we apply binary search on element W[j] to check whether it is already present in the sorted prefix of the window from index 0 to j - 1. If W[j] is not present in this prefix, we increase distCount by 1 for that window. Think!
int[] getDistinctCount(int X[], int n, int k)
{
    int output[n - k + 1]
    for (int i = 0; i <= n - k; i = i + 1)
    {
        int distCount = 0
        int W[k] = copy of window elements X[i] to X[i + k - 1]
        sort(W, 0, k - 1)
        for (int j = 0; j < k; j = j + 1)
        {
            distCount = distCount + 1
            int index = binarySearch(W, 0, j - 1, W[j])
            if (index != -1)
                distCount = distCount - 1
        }
        output[i] = distCount
    }
    return output
}
int* getDistinctCount(int X[], int n, int k)
{
    int* output = new int[n - k + 1];
    for (int i = 0; i <= n - k; i = i + 1)
    {
        // Sort a copy of the current window so that the original order
        // of the array is preserved for the remaining windows.
        vector<int> W(X + i, X + i + k);
        sort(W.begin(), W.end());
        int distCount = 0;
        for (int j = 0; j < k; j = j + 1)
        {
            distCount = distCount + 1;
            if (binary_search(W.begin(), W.begin() + j, W[j]))
                distCount = distCount - 1;
        }
        output[i] = distCount;
    }
    return output;
}
Time complexity = (n - k + 1) * (Time complexity of sorting k size window + Time complexity of searching X[j] using binary search) = (n - k + 1) * (O(k*logk) + k * O(logk)) = O(n) * O(k*logk) = O(n*k*logk)
Space complexity = O(k), for the temporary copy of the window that we sort in each iteration. We sort a copy rather than the window itself because sorting the array in place would disturb the order of elements shared with later windows. Think!
Can we further optimize the above approach? Let's think!
After sorting the window in the above approach, all duplicate elements will be placed adjacent to each other. So rather than applying binary search for each window element in a loop, we can count distinct elements using a single loop or single traversal of the window. Here, we can use the idea of the fast and slow pointer used in this blog: Remove duplicates from sorted array.
int* getDistinctCount(int X[], int n, int k)
{
    int* output = new int[n - k + 1];
    for (int i = 0; i <= n - k; i++)
    {
        // Sort a copy of the current window so that the original order
        // of the array is preserved for the remaining windows.
        vector<int> W(X + i, X + i + k);
        sort(W.begin(), W.end());
        // slow marks the start of the current run of equal values,
        // fast scans ahead to find the start of the next run.
        int slow = 0;
        int fast = 1;
        int distCount = 1;
        while (fast < k)
        {
            if (W[slow] != W[fast])
            {
                distCount = distCount + 1;
                slow = fast;
            }
            fast = fast + 1;
        }
        output[i] = distCount;
    }
    return output;
}
Time complexity = (n - k + 1) * (Time complexity of sorting k size window + Time complexity of counting unique elements in the current window) = (n - k + 1) * (O(klogk) + O(k)) = O(n) * O(klogk) = O(n*k*logk). The time complexity looks the same as the above approach, but this version is slightly better in practice because we replace k binary searches with a single linear scan of the sorted window.
Space complexity = O(k), for the temporary copy of the window used for sorting. Think!
In the above approaches, sorting contributes a significant factor to the time complexity. Can we count unique elements in a window without sorting? Can we apply the idea of hashing in a window to improve the time complexity of searching distinct elements? Let's think!
The idea is: we traverse each window of size k using a loop and use a hash table to store the unique elements of that window. Whenever a window element is not present in the hash table, we insert it and increase the distinct count by 1. Think!
int[] getDistinctCount(int X[], int n, int k)
{
int output[n - k + 1]
for(int i = 0; i <= n - k; i = i + 1)
{
hash table H
int distCount = 0
for(int j = i; j < i + k; j = j + 1)
{
if (H.search(X[j]) == false)
{
H.insert(X[j])
distCount = distCount + 1
}
}
output[i] = distCount
}
return output
}
int* getDistinctCount(int X[], int n, int k)
{
int* output = new int[n - k + 1];
for (int i = 0; i <= n - k; i = i + 1)
{
unordered_map<int, bool> H;
int distCount = 0;
for (int j = i; j < i + k; j = j + 1)
{
if (H.count(X[j]) == 0)
{
H[X[j]] = true;
distCount = distCount + 1;
}
}
output[i] = distCount;
}
return output;
}
def getDistinctCount(X, n, k):
    output = [0] * (n - k + 1)
    for i in range(n - k + 1):
        H = {}
        distCount = 0
        for j in range(i, i + k):
            if X[j] not in H:
                H[X[j]] = True
                distCount = distCount + 1
        output[i] = distCount
    return output
public class Solution
{
public int[] getDistinctCount(int[] X, int n, int k)
{
int[] output = new int[n - k + 1];
for (int i = 0; i <= n - k; i = i + 1)
{
HashMap<Integer, Boolean> H = new HashMap<>();
int distCount = 0;
for (int j = i; j < i + k; j = j + 1)
{
if (!H.containsKey(X[j]))
{
H.put(X[j], true);
distCount = distCount + 1;
}
}
output[i] = distCount;
}
return output;
}
}
Time complexity = (n - k + 1) * Time complexity of counting unique elements using hash table in k size window = O(n) * k * O(1) = O(n*k). Note: the time complexity of searching and insertion in a hash table would be O(1) average.
Space complexity = O(k), for using the O(k) size hash table to count distinct elements in the k size window.
Now the critical question is: can we solve this problem in a single scan by reusing the computation done for the previous window? Can we use some additional insight from the problem to get the idea of an efficient solution?
From the observation of the problem, the k size windows are placed linearly adjacent to each other. For example, the first window is [0, k - 1], the second window is [1, k], the third window is [2, k + 1], and so on. In other words, any window of size k shares k - 1 elements with the previous adjacent window: we ignore the first element of the previous window and include one new element at the end. For example, in X[] = [1, 1, 1, 3, 4, 2, 3] with k = 4, the second window [1, 1, 3, 4] is obtained from the first window [1, 1, 1, 3] by dropping X[0] and adding X[4].
The idea behind the sliding window is: we can easily calculate the unique element count of the current window if we know the unique element count of the previous window. So we find the count of distinct elements in each window of size k by shifting the window one position to the right and reusing the computation done in the previous step.
Now the critical question is: how do we track the unique element count of the current window using the unique element count of the previous window?
The idea is: we can use a hash table to store the frequency count of elements in the current window. Suppose elements from i to i + k - 1 are stored in the hash table as element-frequency pairs. To update the hash table for the range i + 1 to i + k, we reduce the frequency count of the ith element by 1 and increase the frequency of the (i + k)th element by 1. The distinct count decreases by 1 only if the frequency of the outgoing element drops to 0, and increases by 1 only if the incoming element was not already present in the window (its frequency was 0).
int[] getDistinctCount(int X[], int n, int k)
{
int output[n - k + 1]
Hash Table <int, int> freq
int distCount = 0
for (int i = 0; i < k; i = i + 1)
{
if (freq.search(X[i]) == false)
distCount = distCount + 1
freq[X[i]] = freq[X[i]] + 1
}
output[0] = distCount
for (int i = k; i < n; i = i + 1)
{
if (freq[X[i - k]] == 1)
distCount = distCount - 1
freq[X[i - k]] = freq[X[i - k]] - 1
if (freq[X[i]] == 0)
distCount = distCount + 1
freq[X[i]] = freq[X[i]] + 1
output[i - k + 1] = distCount
}
return output
}
int* getDistinctCount(int X[], int n, int k)
{
    int* output = new int[n - k + 1];
    unordered_map<int, int> freq;
    int distCount = 0;
    // Count distinct elements of the first window [0, k - 1].
    for (int i = 0; i < k; i = i + 1)
    {
        if (freq.count(X[i]) == 0)
            distCount = distCount + 1;
        freq[X[i]] = freq[X[i]] + 1;
    }
    output[0] = distCount;
    // Slide the window one position at a time: remove X[i - k], add X[i].
    for (int i = k; i < n; i = i + 1)
    {
        // If the outgoing element occurs only once, the window loses a distinct value.
        if (freq[X[i - k]] == 1)
            distCount = distCount - 1;
        freq[X[i - k]] = freq[X[i - k]] - 1;
        // If the incoming element is not present, the window gains a distinct value.
        if (freq[X[i]] == 0)
            distCount = distCount + 1;
        freq[X[i]] = freq[X[i]] + 1;
        output[i - k + 1] = distCount;
    }
    return output;
}
from collections import defaultdict

def getDistinctCount(X, n, k):
    output = []
    freq = defaultdict(int)
    distCount = 0
    # Count distinct elements of the first window [0, k - 1].
    for i in range(k):
        if freq[X[i]] == 0:
            distCount = distCount + 1
        freq[X[i]] = freq[X[i]] + 1
    output.append(distCount)
    # Slide the window: remove X[i - k], add X[i].
    for i in range(k, n):
        if freq[X[i - k]] == 1:
            distCount = distCount - 1
        freq[X[i - k]] = freq[X[i - k]] - 1
        # Check the frequency count: a key may remain in freq with count 0.
        if freq[X[i]] == 0:
            distCount = distCount + 1
        freq[X[i]] = freq[X[i]] + 1
        output.append(distCount)
    return output
public class Solution
{
public int[] getDistinctCount(int[] X, int n, int k)
{
int[] output = new int[n - k + 1];
HashMap<Integer, Integer> freq = new HashMap<>();
int distCount = 0;
for (int i = 0; i < k; i = i + 1)
{
if (!freq.containsKey(X[i]))
distCount = distCount + 1;
freq.put(X[i], freq.getOrDefault(X[i], 0) + 1);
}
output[0] = distCount;
for (int i = k; i < n; i = i + 1)
{
if (freq.get(X[i - k]) == 1)
distCount = distCount - 1;
freq.put(X[i - k], freq.get(X[i - k]) - 1);
                // Check the frequency count: a key may remain in the map with value 0.
                if (freq.getOrDefault(X[i], 0) == 0)
distCount = distCount + 1;
freq.put(X[i], freq.getOrDefault(X[i], 0) + 1);
output[i - k + 1] = distCount;
}
return output;
}
}
Time complexity = Time complexity of counting unique elements of the first window + Time complexity of counting unique elements of remaining n - k window using sliding window = O(k) + O(n - k) = O(n). Note: the time complexity of searching and insertion in a hash table would be O(1) average.
Space complexity = O(k), for using the O(k) size hash table.
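To quickly verify the efficient solution, we can run it on the first example above. The following is a minimal C++ driver sketch (not part of the solution itself); it assumes the sliding window version of getDistinctCount defined above is available in the same file, and it should print 2 3 4 3 for X[] = [1, 1, 1, 3, 4, 2, 3] and k = 4.
#include <iostream>
#include <unordered_map>
using namespace std;

// getDistinctCount(...) as implemented in the sliding window approach above.

int main()
{
    int X[] = {1, 1, 1, 3, 4, 2, 3};
    int n = 7, k = 4;
    int* output = getDistinctCount(X, n, k);
    // There are n - k + 1 windows, so the output has n - k + 1 entries.
    for (int i = 0; i <= n - k; i = i + 1)
        cout << output[i] << " ";   // expected output: 2 3 4 3
    cout << endl;
    delete[] output;
    return 0;
}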
Please write comments if you find an error or you want to share more insights about the topic. Enjoy learning, Enjoy coding, Enjoy algorithms!