Page 1 of 1

Suffix Array

Posted: Wed Nov 07, 2012 10:03 pm
by EduardoQuintana
I found this document refering to Suffix Array (SA):
"(appeared in GInfo 15/7, November 2005) Suffix arrays – a programming contest approach Adrian Vladu and Cosmin Negruşeri"
The running time for this algorithm is O(n log^2 n).

It seems to me, problem "Casting Spells" (at "http://coj.uci.cu/24h/problem.xhtml?abb=2073") can be solved using SA.
I implemented the article's SA, but I got a "Time Limit Exceded".

Questions:
-Is it correct for this problem to use (given the time limits) SA? or there is another way?
-If it is correct to use SA in this problem: then how can I improve it?

Re: Suffix Array

Posted: Mon Nov 12, 2012 4:16 pm
by ymondelo20
You should know that there is an implementation practically O(n) for SA. Then, yes, you can improve your solution.
Grettings

Re: Suffix Array

Posted: Mon Apr 29, 2013 4:03 pm
by Spartan
would you write that efficient solution for us?

Re: Suffix Array

Posted: Mon Apr 29, 2013 4:40 pm
by ymondelo20

Code: Select all

//------------------------------- 2250 Substring Frequency - Using a Suffix Array (MAXN*K) + LCP
#define INVCICLE(i,a,b,c) for(int i = (a); i >= (b); i -= (c))
#define CICLE(i,a,b,c) for(int i=(a); i<=(b); i += (c))
#include <algorithm>
#define MAXN 2000002
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
inline bool leq(int a1, int a2, int b1, int b2) // lexicographic order
{ return(a1 < b1 || a1 == b1 && a2 <= b2); }                 // for pairs
inline bool leq(int a1, int a2, int a3, int b1, int b2, int b3)
{ return(a1 < b1 || a1 == b1 && leq(a2,a3, b2,b3)); }        // and triples
// stably sort a[0..n-1] to b[0..n-1] with keys in 0..K from r
static void radixPass(int* a, int* b, int* r, int n, int K)
{ // count occurrences
  int* c = new int[K + 1];                          // counter array
  for(int i = 0; i <= K; i++) c[i] = 0;            // reset counters
  for(int i = 0; i < n; i++) c[r[a[i]]]++;      // count occurrences
  for(int i = 0, sum = 0; i <= K; i++)      // exclusive prefix sums
  { int t = c[i]; c[i] = sum; sum += t; }
  for(int i = 0; i < n; i++) b[c[r[a[i]]]++] = a[i];         // sort
  delete [] c;
}
// find the suffix array SA of T[0..n-1] in {1..K}^n
// require T[n]=T[n+1]=T[n+2]=0, n>=2
void suffixArray(int* T, int* SA, int n, int K)
{
  int n0 = (n+2)/3, n1 = (n+1)/3, n2 = n/3, n02 = n0 + n2;
  int* R = new int[n02 + 3]; R[n02] = R[n02+1] = R[n02+2] = 0;
  int* SA12 = new int[n02 + 3]; SA12[n02] = SA12[n02+1] = SA12[n02+2] = 0;
  int* R0   = new int[n0];
  int* SA0 = new int[n0];

  //******* Step 0: Construct sample ********
  // generate positions of mod 1 and mod 2 suffixes
  // the "+(n0-n1)" adds a dummy mod 1 suffix if n%3 == 1
  for(int i=0, j=0; i < n+(n0-n1); i++) if(i%3 != 0) R[j++] = i;

  //******* Step 1: Sort sample suffixes ********
  // lsb radix sort the mod 1 and mod 2 triples
  radixPass(R,  SA12, T+2, n02, K);
  radixPass(SA12, R , T+1, n02, K);
  radixPass(R,  SA12, T  , n02, K);
  // find lexicographic names of triples and
  // write them to correct places in R
  int name = 0, c0 = -1, c1 = -1, c2 = -1;
  for(int i = 0; i < n02; i++)
  {
   if(T[SA12[i]] != c0 || T[SA12[i]+1] != c1 || T[SA12[i]+2] != c2)

   { name++; c0 = T[SA12[i]]; c1 = T[SA12[i]+1]; c2 = T[SA12[i]+2]; }
   if(SA12[i] % 3 == 1) { R[SA12[i]/3]      = name; } // write to R1
   else                 { R[SA12[i]/3 + n0] = name; } // write to R2
  }
  // recurse if names are not yet unique
  if(name < n02)
  {
   suffixArray(R, SA12, n02, name);
   // store unique names in R using the suffix array
   for(int i = 0; i < n02; i++) R[SA12[i]] = i + 1;
  }
  else // generate the suffix array of R directly
   for(int i = 0; i < n02; i++) SA12[R[i] - 1] = i;

  //******* Step 2: Sort nonsample suffixes ********
  // stably sort the mod 0 suffixes from SA12 by their first character
  for(int i=0, j=0; i < n02; i++) if(SA12[i] < n0) R0[j++] = 3*SA12[i];
  radixPass(R0, SA0, T, n0, K);

  //******* Step 3: Merge ********
  // merge sorted SA0 suffixes and sorted SA12 suffixes
  #define GetI() (SA12[t] < n0 ? SA12[t] * 3 + 1 : (SA12[t] - n0) * 3 + 2)
  for(int p=0, t=n0-n1, k=0; k < n; k++)
  {
   int i = GetI(); // pos of current offset 12 suffix
   int j = SA0[p]; // pos of current offset 0 suffix
   if(SA12[t] < n0 ? // different compares for mod 1 and mod 2 suffixes
   leq(T[i],         R[SA12[t] + n0], T[j],        R[j/3]) :
   leq(T[i], T[i+1], R[SA12[t]-n0+1], T[j], T[j+1],R[j/3+n0]))
   {// suffix from SA12 is smaller
    SA[k] = i; t++;
    if(t == n02) // done --- only SA0 suffixes left
    for(k++; p < n0; p++, k++) SA[k] = SA0[p];
   }
   else
   {// suffix from SA0 is smaller
    SA[k] = j; p++;
    if(p == n0) // done --- only SA12 suffixes left
    for(k++; t < n02; t++, k++) SA[k] = GetI();
   }
  }
  delete [] R; delete [] SA12; delete [] SA0; delete [] R0;
}

void findLCP(int* T, int* SA, int* rank, int* lcp, int N)
{
 int i, j, k=0;
 CICLE(p,1,N,1) rank[SA[p]]=p;
 for(i=0; i<N; lcp[rank[i++]]=k)
 for(k ? k-- : 0, j=SA[rank[i]-1]; T[i+k]==T[j+k]; k++);
}

int T, LA, LB, N, K, A[MAXN], SA[MAXN], RANK[MAXN], LCP[MAXN];
char cad[2*MAXN], cad2[MAXN];
int main()
{
 scanf("%d",&T);
 CICLE(t,1,T,1)
 {
  scanf("%s%s",&cad, cad2);
  LA = strlen(cad); LB = strlen(cad2);
  strcat(cad,"#"); strcat(cad,cad2); N = strlen(cad);
  CICLE(p,0,N-1,1) A[p] = cad[p]; A[N] = A[N+1] = A[N+2] = 0;
  suffixArray(A, SA, N+1, K = 260); findLCP(A, SA, RANK, LCP, N);
  //printf("%s\n",cad); CICLE(p,1,N,1) printf("%d %d\n",SA[p],LCP[p]);
  int pos = 1, res = 0; while(SA[pos] != LA+1) pos++;
  while(pos < N && LCP[pos+1] >= LB) pos++, res++;
  printf("Case %d: %d\n",t,res);
 }
}
This is submission http://coj.uci.cu/24h/submission.xhtml?id=369018 (with TLE answer, for that problem).

Best Regards, and bon appetit... :D

Re: Suffix Array

Posted: Sun Dec 07, 2014 10:29 am
by HaZard
tengo una duda y no sé si el Suffix Array sea la solucion:
dada una cadena, ¿puede calcularse en buen tiempo cuantas veces aparece cada sufijo de dicha cadena en la original?

Re: Suffix Array

Posted: Sat Jan 17, 2015 6:09 pm
by ymondelo20
Puede servir pero no es muy eficiente digamos.
Para ese tipo de problemas es más conveniente explotar el KMP en realidad.