|
|
|
|
Reverse
Factor algorithm
The Boyer-Moore type algorithms match some suffixes of the pattern but it is possible to match some prefixes of the pattern by scanning the character of the window from right to left and then improve the length of the shifts. This is made possible by the use of the smallest suffix automaton (also called DAWG for Directed Acyclic Word Graph) of the reverse pattern. The resulting algorithm is called the Reverse Factor algorithm. The
smallest suffix automaton of a word w is a Deterministic Finite
Automaton S(w) =(Q,q0,T,E).
The language accepted by S(w)
is L(S(w))={u
in During the searching phase, the Reverse Factor algorithm parses the characters of the window from right to left with the automaton S(xR), starting with state q0. It goes until there is no more transition defined for the current character of the window from the current state of the automaton. At this moment it is easy to know what is the length of the longest prefix of the pattern which has been matched: it corresponds to the length of the path taken in S(xR) from the start state q0 to the last final state encountered. Knowing the length of this longest prefix, it is trivial to compute the right shift to perform. The
Reverse Factor algorithm has a quadratic worst case time complexity but it is
optimal in average. It performs O(
n.log All the functions to create and manipulate a data structure suitable for a suffix automaton are given in the introduction. void buildSuffixAutomaton(char *x, int m, Graph aut) {
int i, art, init, last, p, q, r;
char c;
init = getInitial(aut);
art = newVertex(aut);
setSuffixLink(aut, init, art);
last = init;
for (i = 0; i < m; ++i) {
c = x[i];
p = last;
q = newVertex(aut);
setLength(aut, q, getLength(aut, p) + 1);
setPosition(aut, q, getPosition(aut, p) + 1);
while (p != init &&
getTarget(aut, p, c) == UNDEFINED) {
setTarget(aut, p, c, q);
setShift(aut, p, c, getPosition(aut, q) -
getPosition(aut, p) - 1);
p = getSuffixLink(aut, p);
}
if (getTarget(aut, p, c) == UNDEFINED) {
setTarget(aut, init, c, q);
setShift(aut, init, c,
getPosition(aut, q) -
getPosition(aut, init) - 1);
setSuffixLink(aut, q, init);
}
else
if (getLength(aut, p) + 1 ==
getLength(aut, getTarget(aut, p, c)))
setSuffixLink(aut, q, getTarget(aut, p, c));
else {
r = newVertex(aut);
copyVertex(aut, r, getTarget(aut, p, c));
setLength(aut, r, getLength(aut, p) + 1);
setSuffixLink(aut, getTarget(aut, p, c), r);
setSuffixLink(aut, q, r);
while (p != art &&
getLength(aut, getTarget(aut, p, c)) >=
getLength(aut, r)) {
setShift(aut, p, c,
getPosition(aut,
getTarget(aut, p, c)) -
getPosition(aut, p) - 1);
setTarget(aut, p, c, r);
p = getSuffixLink(aut, p);
}
}
last = q;
}
setTerminal(aut, last);
while (last != init) {
last = getSuffixLink(aut, last);
setTerminal(aut, last);
}
}
char *reverse(char *x, int m) {
char *xR;
int i;
xR = (char *)malloc((m + 1)*sizeof(char));
for (i = 0; i < m; ++i)
xR[i] = x[m - 1 - i];
xR[m] = '\0';
return(xR);
}
int RF(char *x, int m, char *y, int n) {
int i, j, shift, period, init, state;
Graph aut;
char *xR;
/* Preprocessing */
aut = newSuffixAutomaton(2*(m + 2), 2*(m + 2)*ASIZE);
xR = reverse(x, m);
buildSuffixAutomaton(xR, m, aut);
init = getInitial(aut);
period = m;
/* Searching */
j = 0;
while (j <= n - m) {
i = m - 1;
state = init;
shift = m;
while (i + j >= 0 &&
getTarget(aut, state, y[i + j]) !=
UNDEFINED) {
state = getTarget(aut, state, y[i + j]);
if (isTerminal(aut, state)) {
period = shift;
shift = i;
}
--i;
}
if (i < 0) {
OUTPUT(j);
shift = period;
}
j += shift;
}
}
Preprocessing phase
|
user comments and suggestions are invited at KmailDrive@gmail.com