使用C獲取文件中每一行的長度並寫入輸出文件

Question

我是一名生物學專業的學生，我試圖學習perl，python和C，並且還在我的工作中使用這些腳本。 因此，我有一個文件如下：

>sequence1
ATCGATCGATCG
>sequence2
AAAATTTT
>sequence3
CCCCGGGG

輸出應如下所示，即每個序列的名稱以及每一行中的字符數，並在文件末尾打印序列的總數。

sequence1 12
sequence2 8
sequence3 8
Total number of sequences = 3

我可以使perl和python腳本正常工作，以python腳本為例：

#!/usr/bin/python
# Usage: script.py INFILE OUTFILE
#
# Reads INFILE line by line.  A line starting with ">" is a header: its text
# (without ">" and the trailing newline) becomes the current sequence name and
# the sequence counter is incremented.  Every other line is written to OUTFILE
# as "<sequence name> <line length>".  A final summary line reports the number
# of headers seen.

import sys

total_sequence_counts = 0

# The with-statement closes both files on exit, including when an exception
# is raised, so the output is reliably flushed to disk.
with open(sys.argv[1]) as my_file, open(sys.argv[2], "w") as my_output:
    for line in my_file:
        if line.startswith(">"):
            sequence_name = line.rstrip('\n').replace(">", "")
            total_sequence_counts += 1
            continue
        # Length of the sequence line without its trailing newline.
        dna_length = len(line.rstrip('\n'))
        my_output.write(sequence_name + " " + str(dna_length) + '\n')
    my_output.write("Total number of sequences = " + str(total_sequence_counts) + '\n')

現在，我想用C編寫相同的腳本，這是我到目前為止已經實現的：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * First attempt at porting the Python script above to C.
 * NOTE(review): this listing does not compile as written; it is kept
 * verbatim because it is the subject of the question.  The comments marked
 * NOTE(review) only point at what the code literally does.
 */
int main(int argc, char *argv[])
{
/* NOTE(review): these two lines paste the fopen() prototype instead of
   calling it; `input` and `output` are never declared, and argv is unused. */
input = FILE *fopen(const char *filename, "r");
output = FILE *fopen(const char *filename, "w"); 

/* NOTE(review): a counter stored as double; an array declared without a size. */
double total_sequence_counts = 0;
char sequence_name[];

char line [4095]; // set a temporary line length
/* NOTE(review): `buffer` is declared as a single char but assigned a pointer. */
char buffer = (char *) malloc (sizeof(line) +1); // allocate some memory

/* NOTE(review): fgets() is handed `filename`, which is not declared here. */
while (fgets(line, sizeof(line), filename) != NULL) { // read until new line character is not found in line

    buffer = realloc(*buffer, strlen(line) + strlen(buffer) + 1); // realloc buffer to adjust buffer size
    if (buffer == NULL) { // print error message if memory allocation fails
        printf("\n Memory error");
        return 0;
    }
    /* NOTE(review): compares a char with the string literal ">" rather than
       the character constant '>'. */
    if (line[0] == ">") {
        sequence_name = strcpy(sequence_name, &line[1]); 
        total_sequence_counts += 1
        }
        else {
            /* NOTE(review): a double is printed with the %ld conversion. */
            double length = strlen(line);
            fprintf(output, "%s \t %ld", sequence_name, length);
        }
    /* NOTE(review): the summary line sits inside the loop, so it is written
       once per input line. */
    fprintf(output, "%s \t %ld", "Total number of sequences = ", total_sequence_counts);
}
    /* NOTE(review): the fclose lines are prototypes, not calls, and the
       second one follows a return statement. */
    int fclose(FILE *input); // when you are done working with a file, you should close it using this function. 
    return 0;
    int fclose(FILE *output);
    return 0;
}

但是這段代碼當然充滿了錯誤，我的問題是，盡管學習了很多，但我仍然無法正確理解和使用內存分配和指針，因此我知道我在那部分尤其有錯誤。 如果您可以對我的代碼發表評論，看看它如何變成可以實際運行的腳本，那就太好了。 順便說一句，在我的實際數據中，每行的長度都沒有定義，因此我需要為此使用malloc和realloc。

Answer 1

對於像這樣的簡單程序，您一次只看幾行，就不必擔心動態內存分配。 使用合理大小的本地緩沖區可能已經足夠了。

另一件事是，C不特別適合於快速和骯臟的字符串處理。 例如，標准庫中沒有strstrip函數。 您通常最終自己實現這種行為。

一個示例實現如下所示：

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>



#define MAXLEN 80       /* Maximum line length, including null terminator */

/*
 * Usage: prog infile outfile
 *
 * Reads infile line by line (at most MAXLEN - 1 characters per fgets call).
 * Trailing whitespace is stripped from every line.  A line starting with '>'
 * sets the current reference name; any other line is reported to outfile as
 * "<reference>: <length>".  A final line reports how many sequence lines
 * were written.
 *
 * Returns 0 on success; exits with status 1 if the arguments are wrong, a
 * file cannot be opened, or the output cannot be written completely.
 */
int main(int argc, char *argv[])
{
    FILE *in;
    FILE *out;

    char line[MAXLEN];          /* Current line buffer */
    char ref[MAXLEN] = "";      /* Sequence reference buffer */
    int nseq = 0;               /* Sequence counter */

    if (argc != 3) {
        fprintf(stderr, "Usage: %s infile outfile\n", argv[0]);
        exit(1);
    }

    in = fopen(argv[1], "r");
    if (in == NULL) {
        fprintf(stderr, "Couldn't open %s.\n", argv[1]);
        exit(1);
    }

    out = fopen(argv[2], "w");
    if (out == NULL) {          /* check the stream that was just opened */
        fprintf(stderr, "Couldn't open %s for writing.\n", argv[2]);
        fclose(in);
        exit(1);
    }

    while (fgets(line, sizeof(line), in)) {
        int len = (int)strlen(line);

        /* Strip whitespace from end.  The cast is required: passing a
           negative char value to isspace() is undefined behaviour. */
        while (len > 0 && isspace((unsigned char)line[len - 1])) len--;
        line[len] = '\0';

        if (line[0] == '>') {
            /* First char is '>': copy from second char in line.
               ref and line have the same size, so this cannot overflow. */
            strcpy(ref, line + 1);
        } else {
            /* Other lines are sequences */
            fprintf(out, "%s: %d\n", ref, len);
            nseq++;
        }
    }

    fprintf(out, "Total number of sequences. %d\n", nseq);

    fclose(in);

    /* fclose() flushes buffered output; a failure here means data was lost. */
    if (fclose(out) != 0) {
        fprintf(stderr, "Error writing %s.\n", argv[2]);
        exit(1);
    }

    return 0;
}

許多代碼是關於強制參數以及打開和關閉文件的。 （如果您將stdin和stdout用於文件重定向，則可能會削減很多代碼。）

核心是大的while循環。 注意事項：

如果發生錯誤或到達文件末尾， fgets將返回NULL 。
前幾行確定行的長度，然后從末尾刪除空白。
僅僅減少長度是不夠的，剝離后的字符串還必須以空字符'\0'結尾。
當您檢查該行中的第一個字符時，應該與一個字符比較，而不是與字符串比較。 在C中，單引號和雙引號不可互換。 ">"是由兩個字符（'>'和結尾的'\0'）組成的字符串字面量。
處理字符串中的char等可數實體時，請使用整數類型，而不要使用浮點數。 （我在這里使用有符號的 int ，但是因為一行中不能有負數個字符，所以使用無符號類型可能會更好。）
注意， line + 1等效於&line[1] 。
我顯示的代碼不會檢查每個序列始終有一個引用。 我會將其作為練習留給讀者。

對於初學者來說，這可能是很多需要跟蹤的地方。 對於像您這樣的小型文本處理任務，Python和Perl絕對更適合。

編輯：上面的解決方案不適用於長序列； 僅限MAXLEN字符。 但是，如果只需要長度而不是序列的內容，則不需要動態分配。

這是一個不讀取行，而是讀取字符的更新版本。 在'>'上下文中，它存儲了引用。 否則，它只會保持計數：

#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>      /* for isspace() */



#define MAXLEN 80       /* Maximum line length, including null terminator */

/*
 * Usage: prog infile outfile
 *
 * Character-based variant: sequence lines are counted, never stored, so
 * their length is not limited by any buffer.  A line starting with '>'
 * stores up to MAXLEN - 1 characters of reference name (trailing whitespace
 * removed); any other line is reported to outfile as "<reference>: <length>",
 * where length is the position of the last non-whitespace character.
 * A final line reports how many sequence lines were written.
 *
 * Returns 0 on success; exits with status 1 if the arguments are wrong, a
 * file cannot be opened, or the output cannot be written completely.
 */
int main(int argc, char *argv[])
{
    FILE *in;
    FILE *out;

    int nseq = 0;               /* Sequence counter */
    char ref[MAXLEN] = "";      /* Reference name; empty until a '>' line is seen */

    if (argc != 3) {
        fprintf(stderr, "Usage: %s infile outfile\n", argv[0]);
        exit(1);
    }

    in = fopen(argv[1], "r");
    if (in == NULL) {
        fprintf(stderr, "Couldn't open %s.\n", argv[1]);
        exit(1);
    }

    out = fopen(argv[2], "w");
    if (out == NULL) {
        fprintf(stderr, "Couldn't open %s for writing.\n", argv[2]);
        fclose(in);
        exit(1);
    }

    while (1) {
        int c = getc(in);       /* first character of the next line */

        if (c == EOF) break;

        if (c == '>') {
            size_t n = 0;

            /* Store the header text; characters beyond the buffer are
               read and discarded so the rest of the line is consumed. */
            c = fgetc(in);
            while (c != EOF && c != '\n') {
                if (n < sizeof(ref) - 1) ref[n++] = (char)c;
                c = fgetc(in);
            }

            /* Drop trailing whitespace (e.g. '\r' from CRLF files). */
            while (n > 0 && isspace((unsigned char)ref[n - 1])) n--;
            ref[n] = '\0';
        } else {
            size_t len = 0;     /* count up to the last non-space character */
            size_t n = 0;       /* count of all characters on the line */

            while (c != EOF && c != '\n') {
                n++;
                if (!isspace(c)) len = n;
                c = fgetc(in);
            }

            fprintf(out, "%s: %zu\n", ref, len);
            nseq++;
        }
    }

    fprintf(out, "Total number of sequences. %d\n", nseq);

    fclose(in);

    /* fclose() flushes buffered output; a failure here means data was lost. */
    if (fclose(out) != 0) {
        fprintf(stderr, "Error writing %s.\n", argv[2]);
        exit(1);
    }

    return 0;
}

筆記：

fgetc從文件讀取一個字節，並在文件結束時返回此字節或EOF 。 在此實現中，這是唯一使用的閱讀功能。
存儲引用字符串也是通過fgetc在這里實現的。 您也可以在跳過初始尖括號后使用fgets 。
計數只是讀取字節而不存儲它們。 n是總計數， len是直到最后一個非空格的計數。 （您的行可能僅由ACGT組成，沒有任何尾隨空格，因此您可以跳過空格測試，並使用n代替len 。）

Answer 2

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Usage: prog infile outfile
 *
 * Reads infile with getline(), so lines may be arbitrarily long.  A line
 * starting with '>' sets the current sequence name (the text after '>') and
 * increments the sequence counter; any other line is written to outfile as
 * "<name> <length>".  A final line reports the number of headers seen.
 *
 * Returns 0 on success, 1 on a usage, file or memory error.
 */
int main(int argc, char *argv[]){
    FILE *my_file;
    FILE *my_output;
    int total_sequence_counts = 0;
    char *sequence_name = NULL;     /* owned copy of the current header name */
    char *line = NULL;              /* buffer managed by getline() */
    size_t size = 0;
    int status = 0;

    if (argc != 3) {
        fprintf(stderr, "Usage: %s infile outfile\n", argv[0]);
        return 1;
    }

    my_file = fopen(argv[1], "r");
    if (my_file == NULL) {
        fprintf(stderr, "Couldn't open %s.\n", argv[1]);
        return 1;
    }

    my_output = fopen(argv[2], "w");
    if (my_output == NULL) {
        fprintf(stderr, "Couldn't open %s for writing.\n", argv[2]);
        fclose(my_file);
        return 1;
    }

    while (-1 != getline(&line, &size, my_file)) {
        /* Cut the line at the newline.  Unlike strtok(), this never yields
           NULL, so empty lines and a bare ">" are handled safely. */
        line[strcspn(line, "\n")] = '\0';

        if (line[0] == '>') {
            char *name = strdup(line + 1);
            if (name == NULL) {
                fprintf(stderr, "Out of memory.\n");
                status = 1;
                break;
            }
            /* Release the previous name only when a new one replaces it, so
               several sequence lines may follow a single header. */
            free(sequence_name);
            sequence_name = name;
            total_sequence_counts += 1;
            continue;
        }

        fprintf(my_output, "%s %zu\n",
                sequence_name != NULL ? sequence_name : "", strlen(line));
    }

    if (status == 0)
        fprintf(my_output, "Total number of sequences = %d\n", total_sequence_counts);

    fclose(my_file);

    /* fclose() flushes buffered output; a failure here means data was lost. */
    if (fclose(my_output) != 0) {
        fprintf(stderr, "Error writing %s.\n", argv[2]);
        status = 1;
    }

    free(sequence_name);
    free(line);
    return status;
}

使用C獲取文件中每一行的長度並寫入輸出文件

問題描述

2 個解決方案

解決方案1
3 已采納 2014-11-12 11:35:30

解決方案2
0 2014-11-12 11:25:59

使用C獲取文件中每一行的長度並寫入輸出文件

問題描述

2 個解決方案

解決方案1 3 已采納 2014-11-12 11:35:30

解決方案2 0 2014-11-12 11:25:59

解決方案1
3 已采納 2014-11-12 11:35:30

解決方案2
0 2014-11-12 11:25:59