繁体   English   中英

给定长度的共同子序列

[英]Common subsequence of given length

有哪些好方法可以找到两个字符串长度为k所有常见子序列?

例:

s1 = AAGACC

s2 = AGATAACCAGGAGCTGC

所有常见的长度为5的AAGAC AAACCAAGAC AAACC AGACC AAGCC

一种相对简单的方法是从LCS矩阵重建序列。 这是一个O(n ^ 2 * k + x * n)算法,其中x是输出的大小(即长度为k的公共子序列的数量)。 它是在C ++中,但它应该很容易转换为C:

const int N = 100;
int lcs[N][N];
set<tuple<string,int,int,int>> vis;

string s1 = "AAGACC";
string s2 = "AGATAACCAGGAGCTGC";

void reconstruct(const string& res, int i, int j, int k) {
    tuple<string,int,int,int> st(res, i, j, k);
    if (vis.count(st))
        return;
    vis.insert(st);
    if (lcs[i][j] < k) return;
    if (i == 0  && j == 0 && k == 0) {
        cout << res << endl;
        return;
    }
    if (i > 0)
        reconstruct(res, i-1, j, k);
    if (j > 0)
        reconstruct(res, i, j-1, k);
    if (i>0 && j>0 && s1[i-1] == s2[j-1])
        reconstruct(string(1,s1[i-1]) + res, i-1, j-1, k-1);
}

int main() {
    lcs[0][0] = 0;
    for (int i = 0; i <= s1.size(); ++i)
        lcs[i][0] = 0;
    for (int j = 0; j <= s1.size(); ++j)
        lcs[0][j] = 0;
    for (int i = 0; i <= s1.size(); ++i) {
        for (int j = 0; j <= s2.size(); ++j) {
            if (i > 0)
                lcs[i][j] = max(lcs[i][j], lcs[i-1][j]);
            if (j > 0)
                lcs[i][j] = max(lcs[i][j], lcs[i][j-1]);
            if (i > 0 && j > 0 && s1[i-1] == s2[j-1])
                lcs[i][j] = max(lcs[i][j], lcs[i-1][j-1] + 1);
        }
    }
    reconstruct("", s1.size(), s2.size(), 5);
}

基于稍微不同的DP方法,还应该有一种O(n *(k + x))方法来解决这个问题:设f(i,k)为最小索引j ,使得lcs(i,j)> = k 我们已经复发了

f(i, 0) = 0 for all i
f(i, k) = min{f(i-1, k), 
              minimum j > f(i-1, k-1) such that s2[j] = s1[i]}

我们还可以从矩阵f重建长度为k的序列。

s1创建给定长度k的所有子序列的trie ,然后遍历s2并检查长度为k每个序列(如果它在trie中)。

这是使用递归的算法的一个版本,一个大小为k的堆栈,包括两个优化以跳过已经看过的字符并跳过不存在的子子序列。 字符串不是唯一的(可能有重复),因此通过uniq运行输出。

#include <stdio.h>
#include <string.h>

/* s1 is the full first string, s2 is the suffix of the second string
 * (starting after the subsequence at depth r),
 * pos is the positions of chars in s1, r is the recursion depth,
 * and k is the length of subsequences that we are trying to match
 */
void recur(char *s1, char *s2, size_t pos[], size_t r, size_t k)
{
    char seen[256] = {0};       /* have we seen a character in s1 before? */
    size_t p0 = (r == 0) ? 0 : pos[r-1] + 1;    /* start at 0 or pos[r-1]+1 */
    size_t p1 = strlen(s1) - k + r;             /* stop at end of string - k + r */
    size_t p;

    if (r == k)         /* we made it, print the matching string */
    {
        for (p=0; p<k; p++)
            putchar(s1[pos[p]]);
        putchar('\n');
        return;
    }

    for (p=p0; p<=p1; p++)      /* at this pos[], loop through chars to end of string */
    {
        char c = s1[p];         /* get the next char in s1 */
        if (seen[c])
            continue;           /* don't go any further if we've seen this char already */
        seen[c] = 1;

        pos[r] = p;
        char *s = strchr(s2, c);        /* is this char in s2 */
        if (s != NULL)
            recur(s1, s+1, pos, r+1, k);        /* recursively proceed with next char */
    }
}


int main()
{
    char *s1 = "AAGACC";
    char *s2 = "AGATAACCAGGAGCTGC";
    size_t k = 5;
    size_t pos[k];

    if (strlen(s1) < k || strlen(s2) < k)       /* make sure we have at least k chars in each string */
        return 1;       /* exit with error */

    recur(s1, s2, pos, 0, k);
    return 0;
}

输出是:

AAGAC
AAGCC
AAACC
AGACC

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM