Tuesday, February 17, 2015

Repeated DNA Sequences

Repeated DNA Sequences
All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",

Return:
["AAAAACCCCC", "CCCCCAAAAA"].

---------------------------------------- codes -----------------------------------------------------
class Solution {
public:
    vector<string> findRepeatedDnaSequences(string s) {
        vector<string> result;
        unordered_map<int, int> map;
        if (s.size() < 10) {
            return result;
        }
        for (int i = 0; i <= s.size() - 10; i++) {
            int key = getDnaKey(s, i, 10);
            if (map[key]) {
                //bug here
                //if (map[key] > 1) {
                if (map[key] == 1) {
                    result.push_back(s.substr(i, 10));
                }
                //} else {
                //    map[key] += 1;
                //}
                map[key] += 1;
            } else {
                map[key] = 1;
            }
        }
        return result;
    }
    int getDnaKey(string s, int index, int length) {
        int key = 0;
        if (s.size() < index + length) {
            return -1;
        }
        for (int i = index; i < index + length; i++) {
            //bug here
            //key = key * 4 + s[i] & 3;
            key = key * 8 + (((int)(s[i] - 'A')) & 7);
        }
        return key;
    }
};

----------------------------- better codes --------------------------------------------------------------------
class Solution {
    inline bool testAndSetBit(unsigned int *intArr, unsigned int offset) {
        unsigned int byteOff = offset >> 5;
        unsigned int bitOff = offset & ((1 << 5) - 1);

        unsigned int bit = intArr[byteOff] & (1 << bitOff);
        if (bit) {
            return true;
        } else {
            intArr[byteOff] |= 1 << bitOff;
            return false;
        }
    }
public:
    vector<string> findRepeatedDnaSequences(string s) {
        vector<string> result;
        if (s.length() < 11) {
            return result;
        }

        static const int intArrSize = 1 << (2 * 10 - 5);
        unsigned int dataArr[intArrSize] = {0};
        unsigned int recordArr[intArrSize] = {0};

        const char *str = s.c_str();
        int size = s.length();
        static const unsigned int mask = ((1 << 20) - 1);

        unsigned int data = 0;
        for (int i = 0; i < 10; ++i) {
            unsigned int twobits = (str[i] >> 1) & 0x3;
            twobits <<= ((9 - i) << 1);
            data |= twobits;
        }
        testAndSetBit(dataArr, data);

        char dna[11] = { 0 };
        for (int i = 10; i < size; ++i) {
            unsigned int twobits = (str[i] >> 1) & 0x3;

            data = ((data << 2) | twobits) & mask;
            bool hasSet = testAndSetBit(dataArr, data);
            if (!hasSet) {
                continue;
            }

            hasSet = testAndSetBit(recordArr, data);
            if (hasSet) {
                continue;
            }
            memcpy(dna, str + i - 9, 10);
            result.push_back(dna);
        }

        return result;
    }
};

No comments:

Post a Comment