// Built by Peter A Noble April 2020 Email: panoble2017@gmail.com
// Copyright 2020

#include <ctype.h>
#include <float.h>
#include <math.h>

#include <complex>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

// g++ extract_target_words.cpp -o extract_target_words
// ./extract_target_words target.txt electronic_health_records.txt extracted_target_words_out.txt
// ./extract_target_words target.txt one_record.txt extracted_target_words_out.txt
// ./extract_target_words target.txt 505.txt extracted_target_words_out.txt
// ./extract_target_words target.txt missing.txt extracted_missing_target_words_out.txt
// ./extract_target_words target.txt all2.txt extracted_target_words_out.txt
// ./extract_target_words target.txt bp.txt extracted_target_words_bp_out.txt
// ./extract_target_words target.txt 1_record.txt extracted_1_record_out.txt


// Purpose is to identify specific words in a record as well as the words surrounding the target

using namespace std;

// Forward declarations of the two text-cleaning helpers defined in this file.
// Both rewrite `str` in place, copy the result into `str1` and return 0.
int  clean(char * str, char * str1);
int clean2(char * str, char * str1);

// Normalise one token of unstructured text.
//
// The buffer `str` is rewritten in place, one character at a time:
//   - 'A'..'Z' become lower case,
//   - '=' becomes '_',
//   - '*', '(', ')', '-', '.', '<', '>', '/', ':' and ';' become a space,
//   - ',' becomes a NUL terminator,
//   - every other character (digits included) is left untouched.
// Every character up to the original length is visited, including those that
// end up behind an inserted NUL. The buffer is then copied with strcpy into
// `str1`, which therefore only receives the text in front of the first comma
// and must be large enough to hold it.
//
// Always returns 0.
int clean2(char * str, char * str1)
{
	// Measure once, before any NUL is written into the buffer.
	const int total = static_cast<int>(strlen(str));

	for (int pos = 0; pos < total; ++pos)
		{
		char & ch = str[pos];
		switch (ch)
			{
			case '=':
				ch = '_';
				break;

			case '*': case '(': case ')': case '-': case '.':
			case '<': case '>': case '/': case ':': case ';':
				ch = ' ';
				break;

			case ',':
				// A comma ends the text that strcpy will hand back below.
				ch = '\0';
				break;

			default:
				// Upper-case Latin letters are folded to lower case
				// (assumes an ASCII-compatible character set, where the
				// 26 letters are contiguous).
				if (ch >= 'A' && ch <= 'Z')
					{
					ch = static_cast<char>(ch - 'A' + 'a');
					}
				break;
			}
		}

	strcpy(str1, str);
	return 0;
}

// Tidy the token that follows a record-opening tag (main passes the word read
// right after "<record" / "<doc").
//
// The buffer `str` is rewritten in place, one character at a time:
//   - '=' becomes '_',
//   - '"', '.', ',', '<', '>' and 'R' become a space,
//   - 'D', 'I', 'O' and 'B' become 'd', 'i', 'o' and 'b',
//   - every other character (digits included) is left untouched.
// The rewritten buffer is then copied with strcpy into `str1`, which must be
// large enough to hold it.
//
// Always returns 0.
int clean(char * str, char * str1)
{
	const int total = static_cast<int>(strlen(str));
	int pos = 0;

	while (pos < total)
		{
		const char seen = str[pos];
		char mapped = seen;

		switch (seen)
			{
			case '=':
				mapped = '_';
				break;

			case '"': case '.': case ',':
			case '<': case '>': case 'R':
				mapped = ' ';
				break;

			case 'D': mapped = 'd'; break;
			case 'I': mapped = 'i'; break;
			case 'O': mapped = 'o'; break;
			case 'B': mapped = 'b'; break;

			default:
				break;
			}

		str[pos] = mapped;
		++pos;
		}

	strcpy(str1, str);
	return 0;
}

int main (int argc, char * const argv[]) {
	ifstream in(argv[1]); 	
	ifstream in2(argv[2]); 	
	ofstream out(argv[3]); 	
//	ofstream outfile; 	

int num=1000000;
int standard=100;
int num2=100;

// array declare
char** holder = new char*[num];
for (int s = 0; s < num; s++)
	{
	holder[s] = new char[standard];
	strcpy(holder[s],"test");
	}

char** target = new char*[num2];
for (int s = 0; s < num2; s++)
	{
	target[s] = new char[standard];
	strcpy(holder[s],"test");
	}

// declare variables
int length=0;
string R;
char* S = new char [R.length() +1];
int flag=0;
char word[100];
char word2[100];
int check=0;
int tracker=0;
int stop=0;
int stop2=0;
char cleanWrd[100];
char cleanWrd2[100];
int i=0;
int temp_status=0;

// headers
// out << "Record ID\t" << "Target found\t" << "Sentence\n";
// cout << "Record ID\t" << "Target found\t" << "Sentence\n";

while(!in.eof())
	{
	in >> target[i];	
	i=i+1;	
	}
/*
for (int j = 0; j <= i; j++) // does the word match any in the target file?
	{
	cout << target[j] << "\n" <<flush;
	}
exit(1);
*/
	
while(!in2.eof())
	{
 	//temp_status=0;
	in2 >> word; //clean2(word, word2); 
	
	for (int j = 0; j <= i; j++) // does the word match any in the target file?
		{
		if (strcmp(target[j],word)==0)
			{
			check=1;
			}
		}
		
	if (check==0) // save the previous words if the target word has not been found 
		{
		strcpy(word2,word);
		clean2(word2, cleanWrd2);
		strcpy(holder[tracker],cleanWrd2); 
		tracker=tracker+1; 
		}

	if ((check==1) && (stop==0)) // if target word found then print out the previous words
	//if ((check==1)) // if target word found then print out the previous words
		{
		cout <<  holder[tracker-9] << "\t" << holder[tracker-8] << "\t" << holder[tracker-7] << "\t" << holder[tracker-6]  << "\t"  << holder[tracker-5] << "\t" << holder[tracker-4] << "\t" << holder[tracker-3]<< "\t" << holder[tracker-2] << "\t" << holder[tracker-1] << "\t";
		out  << holder[tracker-9] << "\t" << holder[tracker-8] << "\t" << holder[tracker-7] << "\t" << holder[tracker-6]  << "\t"  << holder[tracker-5] << "\t" << holder[tracker-4] << "\t" << holder[tracker-3]<< "\t" << holder[tracker-2] << "\t" << holder[tracker-1] << "\t" ;
		tracker=0; 
		stop=1; 
		//check=0;
		}

	if ((check==1) && (stop==1) && (stop2<=10)) 
		{ 
		stop2=stop2+1; 
		clean2(word, cleanWrd2); 
		cout << cleanWrd2 << "\t"; 
		out << cleanWrd2 << "\t";
		}

//	if (check==0) {temp_status=0;}
//<doc ob="598">
	if ((strcmp(word,"<record")==0) || (strcmp(word,"<doc")==0) || (strcmp(word,"<RECORD")==0) || (strcmp(word,"<Doc")==0))
		{ 
		cout << "\n";	  
		out << "\n";	  
		in2 >> cleanWrd; clean(cleanWrd,word2); 
		cout << word2 << "\t";
		out << word2 << "\t";
		length=strlen(word2);
		string S(word2,length);
		S += ".txt";
		tracker=0;check=0; stop=0; stop2=0; 

//		outfile.open(S.c_str(), ios::app);
		}
//		outfile << word << " "; 

	if ((strcmp(word,"</RECORD>")==0) || (strcmp(word,"</DOC>")==0) || (strcmp(word,"</record>")==0) || (strcmp(word,"</doc>")==0))
		{ 
//		outfile << "\n";	 //cout << status;
//		outfile.close(); 
			for (int s = 0; s <= 10; s++)
				{
				strcpy(holder[s]," ");
				}
		}
	}

cout << "\n";
out << "\n";
		
return 0;
}
