// fasta454.cpp : Defines the entry point for the console application.
// ./fasta_454 mg1655_2_1.txt 50 > mg1655_2_1_out.txt

//#include "stdafx.h"
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <string>
#include <vector>

// Program designed by Alex E. Pozhitkov, 2011, University of Washington
// pozhit@washington.edu

//#define SUPERCOMP

#if defined SUPERCOMP
#include <cstring>
#endif

using namespace std;

struct Seq454
{
	string s_ID;
	int i_Rank;
	float f_x;
	float f_y;
	string s_Sequence;
	string s_OppositeStrand;
	bool bRedundantDirect;
	bool bRedundantOpposite;
	bool bHeadsMatch;
};

ostream& ReportCoOccurrence(const Seq454& BiggerSeq, const Seq454& SmallerSeq, int idPos, int iOppos)
{
	return cout << '\n' << BiggerSeq.s_ID << '\t' << BiggerSeq.s_Sequence << '\t' << SmallerSeq.s_ID << '\t' <<SmallerSeq.s_Sequence << '\t' << SmallerSeq.s_OppositeStrand << '\t' << idPos << '\t' << iOppos;
}

int main(int argc, char* argv[])
{
	string str;
	vector<Seq454>  AllSequences;

	Seq454 s_454individ;
	ifstream InputFile;
	int ibpLimit = 0;
	if(argc != 3)
		throw 1;
	sscanf(argv[2], "%d", &ibpLimit);

	InputFile.open(argv[1]);
	while(!InputFile.eof())
	{
		getline(InputFile, str);
		if(str == "") continue;
		if(str.find('>') != string::npos)
		{
			//this is a header of a new sequence
			s_454individ.s_OppositeStrand = "";
			s_454individ.s_OppositeStrand.resize(s_454individ.s_Sequence.length());
			string::iterator fops = s_454individ.s_OppositeStrand.begin();
			for (string::reverse_iterator ri = s_454individ.s_Sequence.rbegin(); ri < s_454individ.s_Sequence.rend(); ri++)
			{
				//making second strand
				switch (*ri)
				{
				case 'A':
					*fops++ = 'T';
					break;
				case 'T':
					*fops++ = 'A';
					break;
				case 'G':
					*fops++ = 'C';
					break;
				case 'C':
					*fops++ = 'G';
					break;
				default:
					*fops++ = 'N';
				}
			}
			s_454individ.bHeadsMatch = false;
			s_454individ.bRedundantDirect = false;
			s_454individ.bRedundantOpposite = false;
			AllSequences.push_back(s_454individ);  //save previous seq
			s_454individ.s_Sequence = "";
			int pos1;
			pos1 = str.find_first_of(' ');
			s_454individ.s_ID = str.substr(1, pos1 - 1);
			pos1 = str.find_first_of("rank=");
			sscanf((str.substr(pos1+5)).c_str(), "%d", &s_454individ.i_Rank);
			pos1 = str.find_first_of("x=");
			sscanf((str.substr(pos1+2)).c_str(), "%f", &s_454individ.f_x);
			pos1 = str.find_first_of("y=");
			sscanf((str.substr(pos1+2)).c_str(), "%f", &s_454individ.f_y);
		}
		else s_454individ.s_Sequence += str;
	}

	AllSequences.push_back(s_454individ);  //save last record
	AllSequences.erase(AllSequences.begin());  // remove first empty record
	cout << "BiggerSeqID\tBiggerSeq\tCooccurredSmallerID\tCooccurredSmallerSeq\tCooccurredSmallerOppositeStrand\tPosDirect\tPosOpposite";

	for (int i = 0; i < AllSequences.size(); i++)
	{
		cerr << '\r' << (float) i / AllSequences.size()<<"\t\t";
		if(AllSequences[i].s_Sequence.length() < ibpLimit)
			continue;  //sequence too short

		for (int j = i+1; j < AllSequences.size(); j++)
		{
			int iPosDirect, iPosOpposite;
			if (AllSequences[i].s_Sequence.length() > AllSequences[j].s_Sequence.length())
			{
				iPosDirect = AllSequences[i].s_Sequence.find(AllSequences[j].s_Sequence);
				iPosOpposite = AllSequences[i].s_Sequence.find(AllSequences[j].s_OppositeStrand);
				if((iPosDirect != string::npos) || ( iPosOpposite != string::npos))
				{
					ReportCoOccurrence(AllSequences[i], AllSequences[j], iPosDirect, iPosOpposite);
					AllSequences[i].bRedundantDirect |= (iPosDirect != string::npos);
					AllSequences[i].bRedundantOpposite |= ( iPosOpposite != string::npos);
					AllSequences[j].bRedundantDirect |= (iPosDirect != string::npos);
					AllSequences[j].bRedundantOpposite |= ( iPosOpposite != string::npos);
					AllSequences[i].bHeadsMatch = (iPosDirect == 0) || (iPosOpposite == 0);
					AllSequences[j].bHeadsMatch = AllSequences[i].bHeadsMatch;
					//break;  //exaustive search if commented
				}
			}
			else
			{
				iPosDirect = AllSequences[j].s_Sequence.find(AllSequences[i].s_Sequence);
				iPosOpposite = AllSequences[j].s_Sequence.find(AllSequences[i].s_OppositeStrand);
				if((iPosDirect != string::npos) || ( iPosOpposite != string::npos))
				{
					ReportCoOccurrence(AllSequences[j], AllSequences[i], iPosDirect, iPosOpposite);
					AllSequences[i].bRedundantDirect |= (iPosDirect != string::npos);
					AllSequences[i].bRedundantOpposite |= ( iPosOpposite != string::npos);
					AllSequences[j].bRedundantDirect |= (iPosDirect != string::npos);
					AllSequences[j].bRedundantOpposite |= ( iPosOpposite != string::npos);
					AllSequences[i].bHeadsMatch = (iPosDirect == 0) || (iPosOpposite == 0);
					AllSequences[j].bHeadsMatch = AllSequences[i].bHeadsMatch;
					//break; //exhaustive search if commented
				}
			}
		}
	}
	int i_RD, i_RO, i_HM;
	i_RD = 0; i_RO = 0; i_HM = 0;
	for (vector<Seq454>::const_iterator S = AllSequences.begin(); S != AllSequences.end(); S++)
	{
		i_RD += S->bRedundantDirect;
		i_RO += S->bRedundantOpposite;
		i_HM += S->bHeadsMatch;
	}

	cout <<"\nRedundant_Direct\tRedundant_Opposite\tHeads_Match\tTotal\n"<< i_RD << '\t' << i_RO <<'\t' << i_HM << '\t'<<AllSequences.size();
	return 0;
}

