## Tuesday, July 7, 2015

### LeetCode OJ - Regular Expression Matching

Problem:

Solution:

Standard algorithm based on subset construction. Potentially exponential time processing the pattern, but linear matching time.

In typical subset construction we needed to map a set of NFA nodes into a single DFA node. In this implementation, that map is realized as a custom Trie. That gives a lookup time independent of the number of DFA nodes.

A fairly complicated piece of code - a couple tricks worth notice:

In the pattern, '.' means any character.

In the NFA, '.' means any character other than the ones already in the string.

In the DFA. '\0' represents any character. There is no escaping mechanism in the input so the literal '.' can never appear in the pattern, but it could be in the matching string.

Too complicated? I thought about backtracking and submitted a few times, there are test cases that specially designed to kill brute force backtracking based solution. In the forum, it was mentioned that there exist a dynamic programming based solution that runs in quadratic time. I haven't look into that.

Code:

#include "stdafx.h"

// https://leetcode.com/problems/regular-expression-matching/

#include "LEET_REGULAR_EXPRESSION_MATCHING.h"
#include <set>
#include <map>
#include <list>
#include <queue>
#include <iostream>
#include <sstream>
#include <vector>
#include <string>

using namespace std;

namespace _LEET_REGULAR_EXPRESSION_MATCHING
{
class Solution
{
private:
class NfaEdge
{
public:
NfaEdge(int from, int to, char c) : m_from(from), m_to(to), m_c(c)
{
}

int m_from;
int m_to;
char m_c;
};

class TrieNode
{
public:
TrieNode() : m_value(-1)
{
}

~TrieNode()
{
for (map<int, TrieNode*>::iterator mci = this->m_children.begin(); mci != this->m_children.end(); mci++)
{
delete mci->second;
}
}
map<int, TrieNode*> m_children;
int m_value; // -1 represents a non-leaf
};

int get_epsilon_closure(set<int>& seeds, list<pair<char, int>>* nfa_adjacent_maps, TrieNode* subsetsRoot, map<int, set<int>>* subsets, int* currentDfaNode)
{
// Step 1: Find the epsilon_closure
queue<int> bfs_queue;
set<int> visited;
for (set<int>::iterator si = seeds.begin(); si != seeds.end(); si++)
{
visited.insert(*si);
bfs_queue.push(*si);
}
while (bfs_queue.size() > 0)
{
int current = bfs_queue.front();
bfs_queue.pop();
{
if (nami->first == '\0')
{
if (visited.find(nami->second) == visited.end())
{
visited.insert(nami->second);
bfs_queue.push(nami->second);
}
}
}
}

// Step 2: Find or insert in the trie
TrieNode* currentTrieNode = subsetsRoot;
for (set<int>::iterator vi = visited.begin(); vi != visited.end(); vi++)
{
int currentNfaNode = *vi;
map<int, TrieNode*>::iterator probe = currentTrieNode->m_children.find(currentNfaNode);
if (probe == currentTrieNode->m_children.end())
{
TrieNode* newTrieNode = new TrieNode();
currentTrieNode->m_children.insert(pair<int, TrieNode*>(currentNfaNode, newTrieNode));
currentTrieNode = newTrieNode;
}
else
{
currentTrieNode = probe->second;
}
}

if (currentTrieNode->m_value == -1)
{
currentTrieNode->m_value = ++(*currentDfaNode);
subsets->insert(pair<int, set<int>>(currentTrieNode->m_value, visited));
}

return currentTrieNode->m_value;
}

public:
bool isMatch(string s, string p)
{
int nfaNodeId = 0;
set<int> characters;

// Step 0: Construct the set of all characters
for (unsigned int i = 0; i < p.length(); i++)
{
char pc = p[i];
if (pc != '.')
{
set<int>::iterator probe = characters.find(pc);
if (probe == characters.end())
{
characters.insert(pc);
}
}
}

// Step 1: Construct the NFA
list<NfaEdge> nfa_edges;
int prevNodeId= 0;
for (unsigned int i = 0; i < p.length(); i++)
{
char pc = p[i];
bool star = false;
if (i != p.length() - 1)
{
star = p[i + 1] == '*';
}

int nextNodeId = ++nfaNodeId;

nfa_edges.push_back(NfaEdge(prevNodeId, nextNodeId, pc));
if (pc == '.')
{
for (set<int>::iterator ci = characters.begin(); ci != characters.end(); ci++)
{
nfa_edges.push_back(NfaEdge(prevNodeId, nextNodeId, *ci));
}
}

if (star)
{
nfa_edges.push_back(NfaEdge(prevNodeId, nextNodeId, '\0'));
nfa_edges.push_back(NfaEdge(nextNodeId, prevNodeId, '\0'));
nextNodeId = ++nfaNodeId;
nfa_edges.push_back(NfaEdge(nextNodeId - 1, nextNodeId, '\0'));
i++;

}
prevNodeId = nextNodeId;
}

int numNfaNodes = nfaNodeId + 1;

// Step 2: To adjacency sets
list<pair<char, int>>* nfa_adjacent_maps = new list<pair<char, int>>[numNfaNodes];
for (list<NfaEdge>::iterator nei = nfa_edges.begin(); nei != nfa_edges.end(); nei++)
{
}

// Step 3: Convert to a DFA using subset construction
int currentDfaNode = -1;
TrieNode* subsetsRoot = new TrieNode();
map<int, set<int>> subsets;
set<int> rootSeed;
rootSeed.insert(0);
int rootNode = get_epsilon_closure(rootSeed, nfa_adjacent_maps, subsetsRoot, &subsets, &currentDfaNode);

map<int, map<char, int>> dfa;

queue<int> bfs_queue;
set<int> visited;
bfs_queue.push(rootNode);
visited.insert(rootNode);
while (bfs_queue.size() > 0)
{
int current_node = bfs_queue.front();
bfs_queue.pop();
set<int>& subset = subsets[current_node];

// Step 3.1: Create the branches
map<char, set<int>> branch;
for (set<int>::iterator si = subset.begin(); si != subset.end(); si++)
{
int current_nfa_node = *si;
{
if (nami->first != '\0')
{
map<char, set<int>>::iterator probe1 = branch.find(nami->first);
if (probe1 == branch.end())
{
branch.insert(pair<char, set<int>>(nami->first, set<int>()));
}
set<int>& branchSet = branch[nami->first];
set<int>::iterator probe2 = branchSet.find(nami->second);
if (probe2 == branchSet.end())
{
branchSet.insert(nami->second);
}
}
}
}

// Step 3.2: Branches to the eplison closure of them
for (map<char, set<int>>::iterator bi = branch.begin(); bi != branch.end(); bi++)
{
int branchTargetDfaNode = get_epsilon_closure(bi->second, nfa_adjacent_maps, subsetsRoot, &subsets, &currentDfaNode);

map<int, map<char, int>>::iterator probe1 = dfa.find(current_node);
if (probe1 == dfa.end())
{
dfa.insert(pair<int, map<char, int>>(current_node, map<char, int>()));
}
map<int, map<char, int>>::iterator probe2 = dfa.find(branchTargetDfaNode);
if (probe2 == dfa.end())
{
dfa.insert(pair<int, map<char, int>>(branchTargetDfaNode, map<char, int>()));
}

char bc = bi->first;
if (bc == '.')
{
bc = '\0'; // In the DFA, we use '\0' to represent the other characters
}

dfa[current_node].insert(pair<char, int>(bc, branchTargetDfaNode));

if (visited.find(branchTargetDfaNode) == visited.end())
{
visited.insert(branchTargetDfaNode);
bfs_queue.push(branchTargetDfaNode);
}
}
}

delete subsetsRoot;

// Step 4: Run the DFA for the answer
int currentState = 0;
for (unsigned int si = 0; si < s.length(); si++)
{
char current_character = s[si];
if (characters.find(current_character) == characters.end())
{
current_character = '\0'; // This is an 'other' character, see if there is an allowed branch in the DFA
}
map<char, int>::iterator probe =  dfa[currentState].find(current_character);
if (probe == dfa[currentState].end())
{
return false;
}
else
{
currentState = probe->second;
}
}

return subsets[currentState].find(numNfaNodes - 1) != subsets[currentState].end();
}
};
};

using namespace _LEET_REGULAR_EXPRESSION_MATCHING;

int LEET_REGULAR_EXPRESSION_MATCHING()
{
Solution s;
cout << (s.isMatch("aa","a") == 0) << endl;
cout << s.isMatch("aa","aa") << endl;
cout << (s.isMatch("aaa","aa") == 0) << endl;
cout << s.isMatch("aa", "a*") << endl;
cout << s.isMatch("aa", ".*") << endl;
cout << s.isMatch("ab", ".*") << endl;
cout << s.isMatch("aab", "c*a*b") << endl;
cout << (s.isMatch("ab", ".*c") == 0)   << endl;
cout << (s.isMatch("aaa", "ab*a") == 0) << endl;

return 0;
}