infrastructureDataStructure2

题目描述 [原题链接](https://leetcode-cn.com/problems/repeated-dna-sequences/)

所有 DNA 由一系列缩写为 A，C，G 和 T 的核苷酸组成，例如：“ACGAATTCCG”。在研究 DNA 时，识别 DNA 中的重复序列有时会对研究非常有帮助。

编写一个函数来查找 DNA 分子中所有出现超过一次的 10 个字母长的序列（子串）。

示例:

输入: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"

输出: ["AAAAACCCCC", "CCCCCAAAAA"]

算法描述

思路：字符串中只存在四个字符 A、C、G、T，那么可以用两位的二进制（00、01、10、11）来表示这四个字符，
那么十个字符就对应二十位的二进制，即每个长度为 10 的子串都可以转换成一个 20 位的二进制数字。

又由于每个字符有四种可能，十个字符一共有 4^10 = 1<<20 种组合，那么就定义 1<<20 大小的 isExist 数组来表示每个序列是否已出现过，
再定义 1<<20 大小的 isAdd 数组来表示每个序列是否已经添加到结果里面。

1. 定义 k 为低 18 位全为 1 的数字（即 (1<<18)-1），将转换得到的数字 key 与 k 做按位与运算，可以保留 key 的后十八位，
也就是对应 9 个字符；然后左移两位再加上下一个字符，就是新的十个字符对应的二进制数字。

2. 然后根据 isExist 来判断该序列是否已经存在：
若为 false，则将其状态置为 true；
若为 true，则根据 isAdd 来判断是否已经添加，若为 false 则添加到结果中并将 isAdd 置为 true。

C++代码1

// Straightforward solution: count every 10-character window in a hash map
// keyed by the substring itself.
class Solution {
public:
    // Returns each 10-letter substring of `s` that occurs more than once.
    // A substring is reported exactly once, at the moment its second
    // occurrence is encountered, so the result is ordered by second occurrence.
    vector<string> findRepeatedDnaSequences(string s) {
        vector<string> repeated;
        unordered_map<string, int> timesSeen;

        const size_t windowLen = 10;
        if (s.length() < windowLen) {
            return repeated;
        }

        // Safe subtraction: the guard above ensures length >= windowLen.
        const size_t lastStart = s.length() - windowLen;
        for (size_t start = 0; start <= lastStart; ++start) {
            const string window = s.substr(start, windowLen);
            auto hit = timesSeen.find(window);
            if (hit == timesSeen.end()) {
                // First sighting of this window.
                timesSeen.insert(pair<string, int>(window, 1));
            } else if (hit->second == 1) {
                // Second sighting: report it and bump the counter so that any
                // further occurrences are ignored.
                repeated.push_back(window);
                hit->second = 2;
            }
        }
        return repeated;
    }
};

C++代码2

// Rolling-code solution: every 10-letter window is packed into a 20-bit
// integer (2 bits per nucleotide), so windows are compared and hashed as ints
// instead of as substrings.
class Solution {
public:
    // Number of nucleotides in one window.
    static constexpr size_t kWindowLength = 10;
    // Keeps the low 20 bits of a code, i.e. the 10 most recent nucleotides.
    static constexpr int kWindowMask = 0xFFFFF;

    // Shifts the rolling code `pDNA` left by one nucleotide and appends
    // `pChar`. Mapping: 'T' -> 1, 'C' -> 2, 'G' -> 3; 'A' and any other
    // character -> 0. The result is truncated to 20 bits so the oldest
    // nucleotide falls out of the window.
    static int encode(char pChar, int pDNA) {
        int aTwoBit = 0;
        switch (pChar) {
            case 'T': aTwoBit = 1; break;
            case 'C': aTwoBit = 2; break;
            case 'G': aTwoBit = 3; break;
            default:  break;
        }
        return ((pDNA << 2) & kWindowMask) | aTwoBit;
    }

    // Expands the low 20 bits of `pDNA` back into its 10-letter string,
    // using the inverse of the mapping in encode(). The string is built
    // locally, so this function has no dependency on any prior call.
    static string decode(int pDNA) {
        // Index == two-bit value produced by encode().
        const char *aLetters = "ATCG";
        string aResult(kWindowLength, 'A');
        // The least significant two bits hold the last character, so fill
        // the string from the back.
        for (size_t aIndex = kWindowLength; aIndex-- > 0;) {
            aResult[aIndex] = aLetters[pDNA & 0x3];
            pDNA >>= 2;
        }
        return aResult;
    }

    // Returns every 10-letter substring of `s` that occurs more than once.
    // Each one is reported once, in the order in which its second occurrence
    // is encountered. The strings are rebuilt from their codes via decode(),
    // so a character outside A/C/G/T comes back as 'A'.
    vector<string> findRepeatedDnaSequences(string s) {
        vector<string> aResult;
        // With 10 or fewer characters there is at most one window, so
        // nothing can repeat.
        if (s.size() <= kWindowLength) { return aResult; }

        unordered_set<int> aSeen;      // codes encountered at least once
        unordered_set<int> aReported;  // codes already added to aResult

        int aCode = 0;
        for (size_t i = 0; i < s.size(); ++i) {
            aCode = encode(s[i], aCode);
            // The code only describes a full window from the 10th char on.
            if (i + 1 < kWindowLength) { continue; }

            // insert().second is false when the code was already present.
            const bool aFirstSighting = aSeen.insert(aCode).second;
            if (!aFirstSighting && aReported.insert(aCode).second) {
                aResult.push_back(decode(aCode));
            }
        }
        return aResult;
    }
};

Java代码

/**
 * Finds all 10-letter sequences that occur more than once in a DNA string.
 *
 * Each nucleotide is mapped to two bits, so a 10-letter window becomes a
 * 20-bit integer that indexes directly into two boolean tables of size 1 << 20.
 */
class Solution {
    /** Keeps the low 18 bits of the rolling key, i.e. the last 9 nucleotides. */
    private static final int NINE_CHAR_MASK = (1 << 18) - 1;

    /**
     * Returns every 10-letter substring of {@code s} that appears more than once.
     * Each substring is reported once, in the order in which its second
     * occurrence is encountered.
     *
     * @param s the DNA string; only 'A', 'C', 'G' and 'T' are accepted
     * @return the repeated 10-letter substrings (empty when there are none)
     * @throws IllegalArgumentException if {@code s} contains any other character
     */
    public List<String> findRepeatedDnaSequences(String s) {
        List<String> list = new ArrayList<String>();
        boolean[] isExist = new boolean[1 << 20]; // window seen at least once
        boolean[] isAdd = new boolean[1 << 20];   // window already in the result
        int key = 0;
        for (int i = 0; i < s.length(); i++) {
            // Append the current nucleotide to the rolling key.
            key = (key << 2) + getValue(s.charAt(i));
            if (i < 9) {
                continue; // fewer than 10 characters accumulated so far
            }
            if (!isExist[key]) {
                isExist[key] = true;
            } else if (!isAdd[key]) {
                isAdd[key] = true;
                list.add(s.substring(i - 9, i + 1));
            }
            // Drop the oldest nucleotide so the next shift yields 20 bits again.
            key &= NINE_CHAR_MASK;
        }
        return list;
    }

    /**
     * Maps a nucleotide to its two-bit code: A=0, C=1, G=2, T=3.
     *
     * @throws IllegalArgumentException for any other character
     */
    private static int getValue(char c) {
        switch (c) {
            case 'A':
                return 0;
            case 'C':
                return 1;
            case 'G':
                return 2;
            case 'T':
                return 3;
            default:
                throw new IllegalArgumentException("Illegal character");
        }
    }
}