# This script builds the trade data set based on data downloaded from Tick History

# Working directory: the relative paths used by this script (code/, input/,
# temp/) are resolved against this location
setwd("D:/")

# Load the project functions. The filter functions called further down are not
# defined in this script, so they are expected to come from this file
source("code/effSpread_functions.R")

# Required packages
require(data.table);   # The script uses data.table, which is much faster than data.frames. 
require(R.utils);      # Used when loading data in fread with colClasses
require(DescTools);    # Used for winsorization
require(ggplot2);      # Used for plots
require(RColorBrewer); # Used for colours in micro-price plots

##########################################
# 1. BUILD THE TRADE AND QUOTE DATA SETS #
##########################################

# Pre-estimation sample: the first week (Date values 1 to 5) is used only for
# pre-estimation of the micro-price adjustment function
load(file = "input/pseudodata")
tq <- tq[Date %in% 1:5]

# Security identifiers present in the pre-estimation sample
stocks <- unique(tq$Stock)

# Run the micro-price step once per security.
# NOTE(review): tq is not passed as an argument, so the filter function
# presumably reads it from the global environment, and presumably writes its
# micro-price output to temp/ (it is loaded from there later on) -- confirm
# against code/effSpread_functions.R.
# The call is made for its side effects only, so the return values are dropped:
# lapply() avoids sapply()'s attempt to simplify them, and invisible() keeps
# them from being auto-printed when the script is run at top level.
invisible(lapply(
  stocks,
  tradequoteFilter_function,
  outfolder = "temp/",
  silent = TRUE,
  microprice.return = TRUE
))

# Drop the pre-estimation objects before the baseline sample is loaded
rm(tq, stocks)

# Baseline sample: reload the raw data and keep the second week
# (Date values 6 to 10)
load(file = "input/pseudodata")
tq <- tq[Date %in% 6:10]

# Security identifiers present in the baseline sample
stocks <- unique(tq$Stock)

# Filter the trades security by security, asking the filter to load the
# micro-price output from temp/, which yields one result per security
tt <- lapply(
  stocks,
  tradequoteFilter_function,
  outfolder = "temp/",
  silent = FALSE,
  load.microprice = TRUE,
  microprice.folder = "temp/"
)

# Stack the per-security results into a single table
tt <- do.call(rbind, tt)

# Direction of trade indicator D: the sign of (Price - mid), i.e. +1 above the
# midpoint and -1 below it. Trades exactly at the midpoint (sign 0) fall back
# to the tick direction: "^" maps to +1, "v" to -1, "" stays 0, and any other
# tickDir value gives NA
tt[, D := {
  side <- sign(Price - mid)
  at_mid <- !is.na(side) & side == 0
  side[at_mid] <- c(0, 1, -1)[match(tickDir[at_mid], c("", "^", "v"))]
  side
}]

# Keep only the trades whose direction is known and non-zero
tt <- tt[!is.na(D) & D != 0]

# Effective and quoted spreads, winsorized at the stock level.
# Effective spreads are 20000 (= 2 x 10000) times the signed distance between
# the trade price and a benchmark, relative to the midpoint, i.e. twice the
# signed relative distance in basis points. The quoted spread is
# 10000 * spr / mid.
winz <- 0.01 # tail probability clipped on each side (1st / 99th percentile)

# Winsorization is done with base R rather than
# DescTools::Winsorize(x, probs = ...): the signature of that function differs
# between DescTools releases (newer ones take the limits through `val` and no
# longer accept `probs`), so the original call fails on current installations.
# The helper mirrors the older implementation: type-7 quantiles, na.rm = FALSE
# (a missing value in x is an error, as before), values outside the limits are
# set to the limits.
.winsorize_tails <- function(x, tail) {
  lim <- quantile(x, probs = c(tail, 1 - tail), na.rm = FALSE,
                  names = FALSE, type = 7)
  x[x < lim[1L]] <- lim[1L]
  x[x > lim[2L]] <- lim[2L]
  x
}

# Midpoint effective spread
tt[, espr_mid := .winsorize_tails(20000 * D * (Price - mid) / mid, winz), by = Stock]
# Weighted midpoint effective spread
tt[, espr_wmd := .winsorize_tails(20000 * D * (Price - wmd) / mid, winz), by = Stock]
# Micro-price effective spread
tt[, espr_mic := .winsorize_tails(20000 * D * (Price - mic) / mid, winz), by = Stock]
# Quoted spread
tt[, qspr := .winsorize_tails(10000 * spr / mid, winz), by = Stock]

# The helper is only needed here; winz is kept because it is removed in the
# clean-up at the end of the script
rm(.winsorize_tails)

# Price impact: signed change of each benchmark (midpoint, weighted midpoint,
# micro-price) between the trade and a later point in time, relative to the
# midpoint at the trade and scaled by 20000.
# NOTE(review): the 10s / 5min columns are presumably the benchmark values
# 10 seconds / 5 minutes after the trade -- confirm against the filter function
tt[, `:=`(
  pi10s_mid = 20000 * D * (mid10s - mid) / mid,
  pi10s_wmd = 20000 * D * (wmd10s - wmd) / mid,
  pi10s_mic = 20000 * D * (mic10s - mic) / mid,
  pi5mn_mid = 20000 * D * (mid5min - mid) / mid,
  pi5mn_wmd = 20000 * D * (wmd5min - wmd) / mid,
  pi5mn_mic = 20000 * D * (mic5min - mic) / mid
)]

# Realized spread: signed distance between the trade price and the later
# benchmark value, with the same scaling. Neither measure is winsorized here
tt[, `:=`(
  rs10s_mid = 20000 * D * (Price - mid10s) / mid,
  rs10s_wmd = 20000 * D * (Price - wmd10s) / mid,
  rs10s_mic = 20000 * D * (Price - mic10s) / mid,
  rs5mn_mid = 20000 * D * (Price - mid5min) / mid,
  rs5mn_wmd = 20000 * D * (Price - wmd5min) / mid,
  rs5mn_mic = 20000 * D * (Price - mic5min) / mid
)]

# Dollar volume of each trade in millions: price times volume, divided by 1e6
tt[, USDvol := Price * Volume / 1e6]

# Filter the quotes security by security, with the same settings as the trade
# filter (micro-price output loaded from temp/)
qq <- lapply(
  stocks,
  quoteFilter_function,
  outfolder = "temp/",
  silent = FALSE,
  load.microprice = TRUE,
  microprice.folder = "temp/"
)

# Stack the per-security results into a single table
qq <- do.call(rbind, qq)

# Store the trade and quote tables; these files are the input to all
# subsequent scripts
save(tt, file = "temp/pseudotrades")
save(qq, file = "temp/pseudoquotes")

# Remove the intermediate objects (tt and qq stay in the workspace)
rm(list = c("tq", "stocks", "winz"))
