/*BEGIN_LEGAL
 *  This is an example of the PIN tool that demonstrates some basic PIN APIs
 *  and could serve as the starting point for developing your first PIN tool
 */
#include "pin.H"
#include <iostream>
#include <fstream>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <map>
#include <sys/time.h>
#include <sys/resource.h>
#include "basictypes.h"
/* ================================================================== */
// Global variables
/* ================================================================== */

// Output stream for race reports; defined in another translation unit,
// opened in main() and closed in Fini().
extern FILE *raceFile;

UINT64 insCount = 0;        //number of dynamically executed instructions (summed per basic block in CountBbl)
UINT64 bblCount = 0;        //number of dynamically executed basic blocks
UINT64 threadCount = 0;     //total number of threads, including main thread (bumped in callback_threadstart)

static TLS_KEY tls_key; //Pin thread-local-storage key under which each thread's ThreadState* is stored.

// Bookkeeping tables keyed by the application's pthread handles/addresses.
// Entries are created by the pthread callbacks below.
map<pthread_t, ThreadState*> pthread_threadstate_map;           // filled in callback_pthread_create_post, read in callback_pthread_join
map<pthread_mutex_t*, LockState*> pthread_lockstate_map;        // one LockState per lock address seen by lock/unlock callbacks
map<pthread_barrier_t*, BarrierState*> pthread_barrierstate_map;// one BarrierState per barrier address

// Hand-off between a thread calling pthread_create and the thread it starts:
//  - parent_thread_id is set in callback_pthread_create_pre and reset to -1
//    by callback_threadstart once the child has been initialised;
//  - child_thread_id is set by callback_threadstart and consumed (reset to -1)
//    by callback_pthread_create_post.
// -1 means "not set" for both.
int parent_thread_id = -1;
int child_thread_id = -1;
PIN_LOCK thread_create_lock;   // acquired in callback_pthread_create_pre, released in callback_pthread_create_post
PIN_LOCK barrier_lock;         // taken around the vector-clock updates in the barrier wait callbacks
pthread_mutex_t thread_create_mutex = PTHREAD_MUTEX_INITIALIZER; // mutex paired with thread_create_cond
pthread_cond_t thread_create_cond = PTHREAD_COND_INITIALIZER;    // signalled by callback_threadstart when the child id is published

//run_time = (run_time_end - run_time_begin) / CLOCKS_PER_SEC, computed in Fini().
static double run_time_begin;   // clock() value taken in main() just before PIN_StartProgram
static double run_time_end;     // clock() value taken at the start of Fini()
static double run_time;

int varNum = 0;     // printed in Fini as "Number of memory locations"; not modified in this file
// The next three are incremented in Trace() each time an analysis call is
// inserted, i.e. they count instrumented sites rather than executed accesses.
int varRead = 0;
int varRead2 = 0;
int varWrite = 0;

// Invocation counters for the pthread callbacks; each is incremented by the
// callback its name abbreviates and printed in Fini().
int varCPre=0, varCPost=0, varJoin=0, varLock=0, varUnlock=0, varDes=0;
int varWPre=0, varWPost=0, varBInit=0, varBPre=0, varBPost=0, varBDes=0;
/* ===================================================================== */
// Command line switches
/* ===================================================================== */
// -o <file>: where Fini() writes the summary statistics. The default is the
// empty string, in which case Fini() writes to cerr.
KNOB<string> KnobOutputFile(KNOB_MODE_WRITEONCE,  "pintool",
    "o", "", "specify file name for MyPinTool output");

// -count <0|1>: default "1". main() registers the image/trace/thread-start/
// fini callbacks only when this knob is true.
KNOB<BOOL>   KnobCount(KNOB_MODE_WRITEONCE,  "pintool",
    "count", "1", "count instructions, basic blocks and threads in the application");


// Returns the ThreadState pointer stored under tls_key for the given Pin
// thread id (whatever PIN_GetThreadData yields for that slot).
ThreadState* get_tls(THREADID threadid) {
	return static_cast<ThreadState*>(PIN_GetThreadData(tls_key, threadid));
}

/* ===================================================================== */
// Analysis routines
/* ===================================================================== */

// Analysis routine inserted before every basic block by Trace().
// numInstInBbl: number of instructions in the block about to execute.
void CountBbl(UINT32 numInstInBbl)
{
    insCount += numInstInBbl;
    ++bblCount;
}
// Runs before pthread_create in the creating thread.
// Takes thread_create_lock (released in callback_pthread_create_post) and
// publishes the creator's id so that callback_threadstart can find it.
void callback_pthread_create_pre(THREADID threadid){
	// The lock owner value is threadid+1 so that thread 0 does not pass 0.
	GetLock(&thread_create_lock, threadid+1);
	++varCPre;
	parent_thread_id = threadid;
}
// Runs after pthread_create returns in the creating thread.
//
// Waits until callback_threadstart (running in the new thread) has reset
// parent_thread_id to -1 and published child_thread_id, then records the
// child's ThreadState under its pthread_t handle and releases
// thread_create_lock (taken in callback_pthread_create_pre).
//
// threadid: Pin id of the creating thread (unused here).
// thread:   first argument of pthread_create, i.e. where the new handle is stored.
//
// NOTE(review): if pthread_create fails no thread starts and this wait never
// finishes; the return value is not available to this callback as instrumented.
void callback_pthread_create_post(THREADID threadid, pthread_t* thread){
	varCPost++;

	// pthread_cond_wait must be called with the mutex held; the same mutex
	// also guards parent_thread_id/child_thread_id in callback_threadstart.
	pthread_mutex_lock(&thread_create_mutex);
	while (parent_thread_id != -1)
		pthread_cond_wait(&thread_create_cond, &thread_create_mutex);
	//When reach here, means the created thread has started execution.
	//child_thread_id has been set.
	const int child = child_thread_id;
	child_thread_id = -1;
	pthread_mutex_unlock(&thread_create_mutex);

	if (child == -1) {
		printf("Error: new created thread missing id.\n");
		exit(0);
	}
	pthread_threadstate_map[*thread] = get_tls((THREADID)child);
	ReleaseLock(&thread_create_lock);
}
// Pin thread-start callback (registered in main).
//
// Every thread gets a fresh ThreadState whose own vector-clock slot holds its
// initial epoch. Thread 0 stops there. Any other thread is treated as the
// child of the thread recorded in parent_thread_id: vc_MaxToFirst is applied
// to (child clock, parent clock), the parent's epoch is advanced, and the
// child's id is handed back to callback_pthread_create_post via
// child_thread_id plus a condition-variable signal.
//
// ctxt, flags and v are unused.
void callback_threadstart(THREADID threadid, CONTEXT *ctxt, INT32 flags, VOID *v)
{
    ++threadCount;

    const bool isMainThread = (threadid == 0);

    if (!isMainThread) {
        // Held until the hand-off at the bottom of this function is complete.
        pthread_mutex_lock(&thread_create_mutex);
        if (parent_thread_id == -1) {
            printf("Error: When creating, parent thread missing id.\n");
            exit(0);
        }
    }

    // State common to all threads: epoch (threadid, EPOCH_INIT_CLOCK) and a
    // new vector clock carrying that epoch in the thread's own slot.
    ThreadState* self = new ThreadState();
    self->tid = threadid;
    self->epoch = epoch_new(threadid, EPOCH_INIT_CLOCK);
    self->vc_C = vc_new();
    self->vc_C[threadid] = self->epoch;

    if (isMainThread) {
        PIN_SetThreadData(tls_key, self, threadid);
        return;
    }

    // Fork edge: combine the creator's clock into the child, then move the
    // creator on to its next epoch.
    ThreadState* creator = get_tls((THREADID)parent_thread_id);
    vc_MaxToFirst(self->vc_C, creator->vc_C);
    creator->epoch = epoch_inc(creator->epoch);
    creator->vc_C[creator->tid] = creator->epoch;
    PIN_SetThreadData(tls_key, self, threadid);

    // A previous child id must already have been consumed by its creator.
    if (child_thread_id != -1) {
        printf("[Error: created child ID is not set.]\n");
        exit(0);
    }

    // Publish our id and wake the creator waiting in callback_pthread_create_post.
    child_thread_id = threadid;
    parent_thread_id = -1;

    pthread_cond_signal(&thread_create_cond);
    pthread_mutex_unlock(&thread_create_mutex);
}

// Join callback: threadid is the joining thread, thread is the pthread_t
// being joined. Applies vc_MaxToFirst to (joiner clock, joined clock) and
// then advances the joined thread's epoch. Exits the process if the handle
// was never recorded by callback_pthread_create_post.
void callback_pthread_join(THREADID threadid, pthread_t thread){
	++varJoin;

	map<pthread_t, ThreadState*>::iterator found = pthread_threadstate_map.find(thread);
	if (found == pthread_threadstate_map.end()) {
		printf("Error: When join, the thread u state is missing. \n");
		exit(0);
	}

	ThreadState* joiner = get_tls(threadid);
	ThreadState* joined = found->second;

	vc_MaxToFirst(joiner->vc_C, joined->vc_C);

	joined->epoch = epoch_inc(joined->epoch);
	joined->vc_C[joined->tid] = joined->epoch;
}

// Lock-acquire callback. Looks up (or lazily creates) the LockState for the
// lock address and applies vc_MaxToFirst to (thread clock, lock clock).
// mutex is the first argument of the instrumented routine; InstrumentPthreadRTN
// also attaches this callback to spin-lock and rwlock acquire routines.
void PIN_FAST_ANALYSIS_CALL callback_pthread_lock(THREADID threadid, pthread_mutex_t* mutex) {
	++varLock;

	LockState* state = 0;
	map<pthread_mutex_t*, LockState*>::iterator pos = pthread_lockstate_map.find(mutex);
	if (pos != pthread_lockstate_map.end()) {
		state = pos->second;
	} else {
		// First time this lock address is seen: give it a fresh clock.
		state = new LockState();
		state->vc_L = vc_new();
		pthread_lockstate_map[mutex] = state;
	}

	ThreadState* self = get_tls(threadid);
	vc_MaxToFirst(self->vc_C, state->vc_L);
}
// Lock-release callback. Looks up (or lazily creates) the LockState for the
// lock address, applies vc_MaxToFirst to (lock clock, thread clock) and then
// advances the releasing thread to its next epoch.
void PIN_FAST_ANALYSIS_CALL callback_pthread_unlock(THREADID threadid, pthread_mutex_t* mutex) {
	++varUnlock;

	LockState* state = 0;
	map<pthread_mutex_t*, LockState*>::iterator pos = pthread_lockstate_map.find(mutex);
	if (pos != pthread_lockstate_map.end()) {
		state = pos->second;
	} else {
		// Release of a lock whose acquire was never observed.
		state = new LockState();
		state->vc_L = vc_new();
		pthread_lockstate_map[mutex] = state;
	}

	ThreadState* self = get_tls(threadid);
	vc_MaxToFirst(state->vc_L, self->vc_C);

	self->epoch = epoch_inc(self->epoch);
	self->vc_C[self->tid] = self->epoch;
}
// Lock-destroy callback: drops the map entry for the lock address, if any.
// NOTE(review): the LockState object itself is not freed here — confirm
// whether that is intentional.
void PIN_FAST_ANALYSIS_CALL callback_pthread_mutex_destroy (THREADID threadid, pthread_mutex_t* mutex) {
	++varDes;

	map<pthread_mutex_t*, LockState*>::iterator pos = pthread_lockstate_map.find(mutex);
	if (pos == pthread_lockstate_map.end())
		return;

	pthread_lockstate_map.erase(pos);
}

// Before a condition wait: count the call and treat it as a release of the
// mutex passed to the wait.
void callback_pthread_cond_wait_pre (THREADID threadid, pthread_mutex_t* mutex) {
	++varWPre;
	callback_pthread_unlock(threadid, mutex);
}
// After a condition wait returns: count the call and treat it as a
// re-acquire of the mutex passed to the wait.
void callback_pthread_cond_wait_post (THREADID threadid, pthread_mutex_t* mutex) {
	++varWPost;
	callback_pthread_lock(threadid, mutex);
}

// Barrier-init callback: registers a BarrierState with a fresh clock for the
// barrier address unless one is already present.
void callback_pthread_barrier_init (THREADID threadid, pthread_barrier_t *b) {
	++varBInit;

	if (pthread_barrierstate_map.count(b) != 0)
		return;

	BarrierState* fresh = new BarrierState();
	fresh->vc_B = vc_new();
	pthread_barrierstate_map[b] = fresh;
}

// Runs before pthread_barrier_wait in each arriving thread.
// Looks up (or lazily creates) the BarrierState for the barrier address and
// applies vc_MaxToFirst to (barrier clock, thread clock).
//
// threadid: Pin id of the arriving thread.
// b:        barrier being waited on.
void callback_pthread_barrier_wait_pre (THREADID threadid, pthread_barrier_t *b) {
	varBPre++;
	ThreadState* t = get_tls(threadid);

	// Several threads arrive at a barrier concurrently, so the map
	// lookup/insert is done under barrier_lock together with the clock update.
	// The owner value follows the threadid+1 convention used for
	// thread_create_lock; casting the pointer to unsigned int would truncate
	// it on 64-bit targets.
	GetLock(&barrier_lock, threadid+1);
	BarrierState* barrier = 0;
	map<pthread_barrier_t*, BarrierState*>::iterator it;
	it = pthread_barrierstate_map.find(b);
	if (it == pthread_barrierstate_map.end()) {
		barrier = new BarrierState();
		barrier->vc_B = vc_new();
		pthread_barrierstate_map[b] = barrier;
	} else {
		barrier = it->second;
	}
	vc_MaxToFirst(barrier->vc_B, t->vc_C);
	ReleaseLock(&barrier_lock);
}

// Runs after pthread_barrier_wait returns in each released thread.
// Looks up (or lazily creates) the BarrierState for the barrier address,
// applies vc_MaxToFirst to (thread clock, barrier clock) and advances the
// thread to its next epoch.
//
// threadid: Pin id of the released thread.
// b:        barrier that was waited on.
void callback_pthread_barrier_wait_post (THREADID threadid, pthread_barrier_t *b) {
	varBPost++;
	ThreadState* t = get_tls(threadid);

	// Map access and clock update share one critical section because all
	// released threads run this at about the same time. The owner value
	// follows the threadid+1 convention used for thread_create_lock; casting
	// the pointer to unsigned int would truncate it on 64-bit targets.
	GetLock(&barrier_lock, threadid+1);
	BarrierState* barrier = 0;
	map<pthread_barrier_t*, BarrierState*>::iterator it;
	it = pthread_barrierstate_map.find(b);
	if (it == pthread_barrierstate_map.end()) {
		barrier = new BarrierState();
		barrier->vc_B = vc_new();
		pthread_barrierstate_map[b] = barrier;
	} else {
		barrier = it->second;
	}
	vc_MaxToFirst(t->vc_C, barrier->vc_B);
	t->epoch = epoch_inc(t->epoch);
	t->vc_C[t->tid] = t->epoch;
	ReleaseLock(&barrier_lock);
}

// Barrier-destroy callback: drops the map entry for the barrier address, if any.
// NOTE(review): the BarrierState object itself is not freed here — confirm
// whether that is intentional.
void PIN_FAST_ANALYSIS_CALL callback_pthread_barrier_destroy (THREADID threadid, pthread_barrier_t*  b) {
	++varBDes;

	map<pthread_barrier_t*, BarrierState*>::iterator pos = pthread_barrierstate_map.find(b);
	if (pos == pthread_barrierstate_map.end())
		return;

	pthread_barrierstate_map.erase(pos);
}

// Analysis routine for memory reads (FastTrack read rule).
//
// threadid:   reading thread
// instAddr:   address of the reading instruction (kept for race reports)
// memoryAddr: effective address being read
// memorySize: access size; not used by this routine
//
// The shadow state of the address holds either a single last-read epoch
// (exclusive mode) or a per-thread read clock (isReadShared).
void PIN_FAST_ANALYSIS_CALL callback_memory_read(THREADID threadid, ADDRINT instAddr, ADDRINT memoryAddr, UINT32 memorySize)
{
	ThreadState* reader = get_tls(threadid);
	VarState* shadow = NULL;

	if (getSM_IsNewVar(memoryAddr, &shadow)) {
		// First access to this address: start in exclusive-read mode.
		// Main thread epoch starts as 1, so a write epoch of 0 means
		// "never written".
		shadow->w = 0;
		shadow->r = reader->epoch;
		shadow->lastINS_R = instAddr;
		shadow->vc_R = NULL;
		shadow->isReadShared = false;
		shadow->race_reported = false;
		return;
	}

	// Fast path: the recorded read epoch is already this thread's current epoch.
	if (shadow->r == reader->epoch)
		return;

	// Write-read race: the last write is not covered by the reader's clock.
	// Only the first race per address is reported.
	if (!shadow->race_reported && shadow->w > reader->vc_C[TID(shadow->w)]) {
		report_race((char*)"w-r", TID(shadow->w), shadow->lastINS_W, shadow->w,
				threadid, instAddr, reader->vc_C[TID(shadow->w)], memoryAddr);
		shadow->race_reported = true;
	}

	// Read-shared: just refresh this thread's slot.
	if (shadow->isReadShared) {
		shadow->vc_R[reader->tid] = reader->epoch;
		shadow->lastINS_VCR[reader->tid] = instAddr;
		return;
	}

	// Exclusive: the previous read is covered by our clock, so replace it.
	if (shadow->r <= reader->vc_C[TID(shadow->r)]) {
		shadow->r = reader->epoch;
		shadow->lastINS_R = instAddr;
		return;
	}

	// Otherwise switch to read-shared, keeping both the previous reader and
	// this one. The clock arrays are allocated on first use.
	if (shadow->vc_R == NULL) {
		shadow->vc_R = vc_new();
		shadow->lastINS_VCR = vc_ins_new();
	}
	shadow->vc_R[TID(shadow->r)] = shadow->r;
	shadow->lastINS_VCR[TID(shadow->r)] = shadow->lastINS_R;
	shadow->vc_R[reader->tid] = reader->epoch;
	shadow->lastINS_VCR[reader->tid] = instAddr;
	shadow->isReadShared = true;
}
// Analysis routine for memory writes (FastTrack write rule).
//
// threadid:   writing thread
// instAddr:   address of the writing instruction (kept for race reports)
// memoryAddr: effective address being written
// memorySize: access size; not used by this routine
//
// At most one race is ever reported per address (race_reported latch).
void PIN_FAST_ANALYSIS_CALL callback_memory_write(THREADID threadid, ADDRINT instAddr, ADDRINT memoryAddr, UINT32 memorySize)
{
	ThreadState* writer = get_tls(threadid);
	VarState* shadow = NULL;

	if (getSM_IsNewVar(memoryAddr, &shadow)) {
		// First access to this address. Main thread epoch starts as 1, so a
		// read epoch of 0 means "never read".
		shadow->r = 0;
		shadow->w = writer->epoch;
		shadow->lastINS_W = instAddr;
		shadow->vc_R = NULL;
		shadow->isReadShared = false;
		shadow->race_reported = false;
		return;
	}

	// Fast path: already written by this thread in its current epoch.
	if (shadow->w == writer->epoch)
		return;

	// Write-write race: the previous write is not covered by our clock.
	if (!shadow->race_reported && shadow->w > writer->vc_C[TID(shadow->w)]) {
		report_race((char*)"w-w", TID(shadow->w), shadow->lastINS_W, shadow->w,
				threadid, instAddr, writer->vc_C[TID(shadow->w)], memoryAddr);
		shadow->race_reported = true;
	}

	// Read-write race: compare against every recorded reader (shared mode)
	// or against the single last reader (exclusive mode).
	if (shadow->isReadShared) {
		// Scanning stops as soon as a race has been reported, since nothing
		// further can be reported for this address afterwards.
		for (int i = 0; !shadow->race_reported && i < (VC_SIZE); ++i) {
			if (shadow->vc_R[i] > writer->vc_C[i]) {
				report_race((char*)"r-w", TID(shadow->vc_R[i]), shadow->lastINS_VCR[i], shadow->vc_R[i],
						threadid, instAddr, writer->vc_C[TID(shadow->vc_R[i])], memoryAddr);
				shadow->race_reported = true;
			}
		}
	} else if (!shadow->race_reported && shadow->r > writer->vc_C[TID(shadow->r)]) {
		report_race((char*)"r-w", TID(shadow->r), shadow->lastINS_R, shadow->r,
				threadid, instAddr, writer->vc_C[TID(shadow->r)], memoryAddr);
		shadow->race_reported = true;
	}

	// Record this write as the latest one.
	shadow->w = writer->epoch;
	shadow->lastINS_W = instAddr;
}

// Called before routines matched by InstrumentFreeRTN with their first
// argument; forwards every non-null pointer to deleteSMKey.
void callback_free(void *ptr)
{
	if (ptr != 0) {
		deleteSMKey(ptr);
	}
}
/* ===================================================================== */
// Instrumentation callbacks
/* ===================================================================== */

VOID InstrumentPthreadRTN(RTN rtn)
{
	string rtn_name = RTN_Name(rtn);
	if (rtn_name.find("pthread_create") != string::npos) {
		RTN_Open(rtn);
		RTN_InsertCall(rtn, IPOINT_BEFORE,
					(AFUNPTR) callback_pthread_create_pre,
					IARG_THREAD_ID,
					IARG_END);
		RTN_InsertCall(rtn, IPOINT_AFTER,
					(AFUNPTR) callback_pthread_create_post,
					IARG_THREAD_ID,
					IARG_FUNCARG_ENTRYPOINT_VALUE, 0,
					IARG_END);
		RTN_Close(rtn);
	} else if (rtn_name.find("pthread_join") != string::npos) {
		RTN_Open(rtn);
		RTN_InsertCall(rtn, IPOINT_BEFORE,
				(AFUNPTR) callback_pthread_join,
				IARG_THREAD_ID,
				IARG_FUNCARG_ENTRYPOINT_VALUE, 0,
				IARG_END);
		RTN_Close(rtn);
	} else if (rtn_name.find("pthread_mutex_lock") != string::npos ||
			rtn_name.find("pthread_mutex_trylock") != string::npos ||
			rtn_name.find("pthread_mutex_timedlock") != string::npos ||
			rtn_name.find("pthread_spin_lock") != string::npos ||
			rtn_name.find("pthread_spin_trylock") != string::npos ||
			rtn_name.find("pthread_rwlock_wrlock") !=string::npos ||
			rtn_name.find("pthread_rwlock_trywrlock") !=string::npos ||
			rtn_name.find("pthread_rwlock_rdlock") !=string::npos ||
			rtn_name.find("pthread_rwlock_tryrdlock" ) !=string::npos) {
		RTN_Open(rtn);
		RTN_InsertCall(rtn, IPOINT_BEFORE,
				(AFUNPTR) callback_pthread_lock,
				IARG_FAST_ANALYSIS_CALL,
				IARG_THREAD_ID,
				IARG_FUNCARG_ENTRYPOINT_VALUE, 0,
				IARG_END);
		RTN_Close(rtn);
	} else if (rtn_name.find("pthread_mutex_unlock") != string::npos ||
			rtn_name.find("pthread_spin_unlock") != string::npos ||
			rtn_name.find("pthread_rwlock_unlock") !=string::npos) {
		RTN_Open(rtn);
		RTN_InsertCall(rtn, IPOINT_BEFORE,
				(AFUNPTR) callback_pthread_unlock,
				IARG_FAST_ANALYSIS_CALL,
				IARG_THREAD_ID,
				IARG_FUNCARG_ENTRYPOINT_VALUE, 0,
				IARG_END);
		RTN_Close(rtn);
	} else if(rtn_name.find("pthread_mutex_destroy")!=string::npos ||
			rtn_name.find("pthread_spin_destroy") !=string::npos||
			rtn_name.find("pthread_rwlock_destroy") !=string::npos) {
		RTN_Open(rtn);
		RTN_InsertCall(rtn, IPOINT_BEFORE,
						(AFUNPTR) callback_pthread_mutex_destroy,
						IARG_FAST_ANALYSIS_CALL,
						IARG_THREAD_ID,
						IARG_FUNCARG_ENTRYPOINT_VALUE, 0,
						IARG_END);
		RTN_Close(rtn);
	} else if (rtn_name.find("pthread_cond_wait") != string::npos ||
			rtn_name.find("pthread_cond_timedwait") != string::npos) {
		RTN_Open(rtn);
		RTN_InsertCall(rtn, IPOINT_BEFORE,
				(AFUNPTR) callback_pthread_cond_wait_pre,
				IARG_THREAD_ID,
				IARG_FUNCARG_ENTRYPOINT_VALUE, 1,
				IARG_END);

		RTN_InsertCall(rtn, IPOINT_AFTER,
				(AFUNPTR) callback_pthread_cond_wait_post,
				IARG_THREAD_ID,
				IARG_FUNCARG_ENTRYPOINT_VALUE, 1,
				IARG_END);
		RTN_Close(rtn);
	} else if (rtn_name.find("pthread_barrier_init") != string::npos) {
		RTN_Open(rtn);
		RTN_InsertCall(rtn, IPOINT_BEFORE,
				(AFUNPTR) callback_pthread_barrier_init,
				IARG_THREAD_ID,
				IARG_FUNCARG_ENTRYPOINT_VALUE, 0,
				IARG_END);
		RTN_Close(rtn);
	} else if (rtn_name.find("pthread_barrier_wait") != string::npos) {
		RTN_Open(rtn);
		RTN_InsertCall(rtn, IPOINT_BEFORE,
				(AFUNPTR) callback_pthread_barrier_wait_pre,
				IARG_THREAD_ID,
				IARG_FUNCARG_ENTRYPOINT_VALUE, 0,
				IARG_END);
		RTN_InsertCall(rtn, IPOINT_AFTER,
				(AFUNPTR) callback_pthread_barrier_wait_post,
				IARG_THREAD_ID,
				IARG_FUNCARG_ENTRYPOINT_VALUE, 0,
				IARG_END);
		RTN_Close(rtn);
	} else if (rtn_name.find("pthread_barrier_destroy")!= string::npos) {
		RTN_Open(rtn);
		RTN_InsertCall(rtn, IPOINT_BEFORE,
						(AFUNPTR) callback_pthread_barrier_destroy,
						IARG_FAST_ANALYSIS_CALL,
						IARG_THREAD_ID,
						IARG_FUNCARG_ENTRYPOINT_VALUE, 0,
						IARG_END);
		RTN_Close(rtn);
	}
}
// Inserts callback_free before any routine whose name contains "free",
// passing the routine's first argument.
// (Any name containing "cfree" also contains "free"; both tests are kept as
// in the original.)
VOID InstrumentFreeRTN(RTN rtn)
{
	const string name = RTN_Name(rtn);

	const bool matches = name.find("free") != string::npos
			|| name.find("cfree") != string::npos;
	if (!matches)
		return;

	RTN_Open(rtn);
	RTN_InsertCall(rtn, IPOINT_BEFORE,
			(AFUNPTR) callback_free,
			IARG_FUNCARG_ENTRYPOINT_VALUE, 0,
			IARG_END);
	RTN_Close(rtn);
}
// Image-load callback (registered in main). Logs the image name, then walks
// every routine of the image with InstrumentPthreadRTN when the name
// contains "libpthread", otherwise with InstrumentFreeRTN when it contains
// "libc". Other images are only logged. v is unused.
VOID ImageLoad(IMG img, VOID *v)
{
    std::cout << "[Load Image] " << IMG_Name(img) << std::endl;

    // Choose the per-routine instrumenter from the image name.
    VOID (*instrumentRtn)(RTN) = NULL;
    if (IMG_Name(img).find("libpthread") != string::npos) {
        instrumentRtn = InstrumentPthreadRTN;   // pthread primitives
    } else if (IMG_Name(img).find("libc") != string::npos) {
        instrumentRtn = InstrumentFreeRTN;      // free-like routines
    }

    if (instrumentRtn == NULL)
        return;

    for (SEC sec = IMG_SecHead(img); SEC_Valid(sec); sec = SEC_Next(sec)) {
        for (RTN rtn = SEC_RtnHead(sec); RTN_Valid(rtn); rtn = RTN_Next(rtn)) {
            instrumentRtn(rtn);
        }
    }
}

// Inserts a predicated fast analysis call before `ins` that receives
// (thread id, instruction pointer, effective address, access size).
// eaArg/sizeArg select which memory operand is described.
static VOID InsertMemoryAccessCall(INS ins, AFUNPTR fn, IARG_TYPE eaArg, IARG_TYPE sizeArg)
{
	INS_InsertPredicatedCall(ins, IPOINT_BEFORE, fn,
			IARG_FAST_ANALYSIS_CALL,
			IARG_THREAD_ID,
			IARG_INST_PTR,
			eaArg,
			sizeArg,
			IARG_END);
}

// Trace instrumentation callback (registered in main). v is unused.
//
// Traces that do not belong to a valid routine, or whose image is rejected
// by IsTraceIgnoreImage, are left uninstrumented. Otherwise every basic
// block gets a CountBbl call and every non-atomic instruction with memory
// operands gets read/write analysis calls.
VOID Trace(TRACE trace, VOID *v)
{
	RTN rtn = TRACE_Rtn(trace);
	if (!RTN_Valid(rtn))
		return;

	// A per-routine filter (IsTraceIgnoreRoutine) was disabled in the original.
	if (IsTraceIgnoreImage(SEC_Img(RTN_Sec(rtn))))
		return;

    // Visit every basic block in the trace
    for (BBL bbl = TRACE_BblHead(trace); BBL_Valid(bbl); bbl = BBL_Next(bbl))
    {
        // Count the block and its instructions each time it executes.
        BBL_InsertCall(bbl, IPOINT_BEFORE, (AFUNPTR)CountBbl, IARG_UINT32, BBL_NumIns(bbl), IARG_END);

        /*
         * The per-instruction filtering below takes part of its code from
         * ThreadSanitizer's InstrumentMopsInBBL.
         * http://code.google.com/p/data-race-test/
         */
        for (INS ins = BBL_InsHead(bbl); INS_Valid(ins); ins = INS_Next(ins)) {
            // Skip atomic updates and instructions without memory operands.
            if (INS_IsAtomicUpdate(ins) || INS_MemoryOperandCount(ins) == 0)
                continue;

            // First memory read operand.
            if (INS_IsMemoryRead(ins)) {
                InsertMemoryAccessCall(ins, (AFUNPTR) callback_memory_read,
                        IARG_MEMORYREAD_EA, IARG_MEMORYREAD_SIZE);
                varRead++;
            }
            // Second memory read operand; the size argument is the same
            // IARG_MEMORYREAD_SIZE as for the first read.
            if (INS_HasMemoryRead2(ins)) {
                InsertMemoryAccessCall(ins, (AFUNPTR) callback_memory_read,
                        IARG_MEMORYREAD2_EA, IARG_MEMORYREAD_SIZE);
                varRead2++;
            }
            // Memory write operand.
            if (INS_IsMemoryWrite(ins)) {
                InsertMemoryAccessCall(ins, (AFUNPTR) callback_memory_write,
                        IARG_MEMORYWRITE_EA, IARG_MEMORYWRITE_SIZE);
                varWrite++;
            }
        }
    }
}

/*!
 * Print out analysis results.
 * This function is called when the application exits.
 *
 * Closes the race-report file (if it was opened), releases the shadow-memory
 * structures via deletePMSM()/deleteStateMap(), and writes the summary either
 * to the file named by the -o knob or, when no name is given or the file
 * cannot be opened, to cerr.
 *
 * @param[in]   code            exit code of the application
 * @param[in]   v               value specified by the tool in the
 *                              PIN_AddFiniFunction function call
 */
VOID Fini(INT32 code, VOID *v)
{
	run_time_end = clock();
	run_time = (run_time_end - run_time_begin)/CLOCKS_PER_SEC;

    // raceFile is NULL if the fopen in main() failed; fclose(NULL) is undefined.
    if (raceFile != NULL) {
        fclose(raceFile);
        raceFile = NULL;
    }
    deletePMSM();
    deleteStateMap();

    // The stream object lives on the stack so it is flushed and closed when
    // this function returns, instead of being leaked.
    const string fileName = KnobOutputFile.Value();
    std::ofstream outFile;
    std::ostream * out = &cerr;
    if (!fileName.empty()) {
        outFile.open(fileName.c_str());
        if (outFile.is_open()) {
            out = &outFile;
        } else {
            // Fall back to stderr rather than silently dropping the results.
            cerr << "Warning: cannot open " << fileName
                 << "; writing analysis results to stderr" << endl;
        }
    }

    *out <<  "===============================================" << endl;
    *out <<  "MyPinTool analysis results: " << endl;
    *out <<  "Number of instructions: " << insCount  << endl;
    *out <<  "Number of basic blocks: " << bblCount  << endl;
    *out <<  "Number of threads: " << threadCount  << endl;
    *out <<  "Number of races: " << raceNum  << endl;
    *out <<  "Number of memory locations: " << varNum << endl;
    *out <<  "Number of memory reads, reads2 & writes: " << varRead << ", " << varRead2 << ", " << varWrite << endl;
    *out <<  "Run-time (in seconds): " << run_time << endl;
    *out <<  "===============================================" << endl;
    *out <<  "Creat_Pre, Creata_Post, Start, Join: " << varCPre << ", " << varCPost << ", " << threadCount << ", "<< varJoin << endl;
    *out <<  "Lock, Unlock, LockDestroy: " << varLock << ", " << varUnlock << ", " << varDes << endl;
    *out <<  "Wait_Pre, Wait_Post: " << varWPre << ", " << varWPost << endl;
    *out <<  "BInit, BPre, BPost, BDes: " << varBInit << ", " << varBPre << ", " << varBPost << ", " << varBDes << endl;
    *out <<  "===============================================" << endl;
}

/*!
 * The main procedure of the tool.
 * This function is called when the application image is loaded but not yet started.
 * @param[in]   argc            total number of elements in the argv array
 * @param[in]   argv            array of command line arguments,
 *                              including pin -t <toolname> -- ...
 */
int main(int argc, char *argv[])
{
    // Initialize PIN library. Print help message if -h(elp) is specified
    // in the command line or the command line is invalid
    PIN_Init(argc,argv);
    PIN_InitSymbols();//Function symbols.
    InitLock(&thread_create_lock);//Initialize the pin lock.
    InitLock(&barrier_lock);//Initialize the pin lock.

    //pin -t pintool argv[6] -- App.
    std::string raceFileName = argv[5];
    const char* raceFileNameStr = raceFileName.c_str();
    raceFile = fopen(raceFileNameStr,"w");

    //Initial thread local storage key and shadow memory PM.
    tls_key = PIN_CreateThreadDataKey(0);
    memset(PM, 0, 65536*sizeof(SM));
    
    if (KnobCount)
    {
    	// Register function to be called to instrument images
    	IMG_AddInstrumentFunction(ImageLoad, 0);

        // Register function to be called to instrument traces
        TRACE_AddInstrumentFunction(Trace, 0);

        // Register function to be called for every thread before it starts running
        PIN_AddThreadStartFunction(callback_threadstart, 0);

        // Register function to be called when the application exits
        PIN_AddFiniFunction(Fini, 0);
    }

    cerr <<  "===============================================" << endl;
    cerr <<  "This application is instrumented by FastTrack" << endl;
    if (!KnobOutputFile.Value().empty())
    {
        cerr << "See file " << KnobOutputFile.Value() << " for analysis results" << endl;
    }
    cerr <<  "===============================================" << endl;

    run_time_begin = clock();
    // Start the program, never returns
    PIN_StartProgram();
    return 0;
}
