## Thursday, April 2, 2015

### LeetCode OJ - Repeated DNA Sequences

Problem:

Solution:

At the first sight it isn't so hard at all - just do a brute force check - time limit exceeded. Then I tried to build a Suffix trie - without carefully compressing it - memory limit exceeded.

The real problem with this one is with these two constraints, we really need to have less than quadratic time and memory to get this done.

I could have write a better compressed suffix trie, but instead I just leveraged the fact that the alphabet is small, so I used two bits to represent a character, and therefore 20 bits to represent a substring with 10 characters in length, store them in a map and do lookup.

Care is taken to make sure a substring is not reported twice, and that's it.

Code:

#include "stdafx.h"

// https://leetcode.com/problems/repeated-dna-sequences/

#include "LEET_REPEATED_DNA_SEQUENCES.h"
#include <map>
#include <iostream>
#include <sstream>
#include <vector>
#include <string>

using namespace std;

namespace _LEET_REPEATED_DNA_SEQUENCES
{
class Solution
{
public:
vector<string> findRepeatedDnaSequences(string s)
{
map<int, int> seen_signatures;
vector<string> result;
int current_signature = 0;
for (int i = 0; i < s.length(); i++)
{
current_signature = current_signature << 2;
current_signature = current_signature & 0x000FFFFF;
if (s[i] == 'A')
{
current_signature = current_signature | 0x00000000;
}
else if (s[i] == 'C')
{
current_signature = current_signature | 0x00000001;
}
else if (s[i] == 'T')
{
current_signature = current_signature | 0x00000002;
}
else if (s[i] == 'G')
{
current_signature = current_signature | 0x00000003;
}
if (i >= 9)
{
map<int, int>::iterator probe = seen_signatures.find(current_signature);
if (probe == seen_signatures.end())
{
seen_signatures.insert(pair<int, int>(current_signature, i - 9));
}
else
{
if (probe->second != -1)
{
result.push_back(s.substr(probe->second, 10));
probe->second = -1;
}
}
}
}

return result;
}
};
};

using namespace _LEET_REPEATED_DNA_SEQUENCES;

int LEET_REPEATED_DNA_SEQUENCES()
{
Solution solution;
vector<string> result = solution.findRepeatedDnaSequences("AAAAAAAAAAAA");
for (unsigned int i = 0; i < result.size(); i++)
{
cout << result[i] << endl;
}
return 0;
}