# This script runs the analyses underlying Sections 3.2, 3.3, and 3.4 of the paper

# Set working directory.
# NOTE(review): this absolute path is machine-specific. Every relative path used
# below ("code/", "temp/", "output/") is resolved against it, so adjust it (or
# launch R from the project root) before running on another machine.
setwd("D:/")

# Load the helper functions and the trade-level data.
# The remainder of the script uses the helpers `spreadBias` and
# `distributionStats` and the trade table `tt`; presumably these are provided
# by the two files below -- verify if either file is replaced.
source("code/effSpread_functions.R")
load("temp/pseudotrades")

# Required packages.
# library() is used instead of require(): require() merely returns FALSE when a
# package is missing, so the script would continue and fail much later with an
# unrelated-looking error. library() stops immediately with a clear message.
library(data.table)    # The script uses data.table, which is much faster than data.frames.
library(lfe)           # Used for regressions with fixed effects and clustered standard errors


###################################
# Table 1: Descriptive statistics #
###################################

# Stock-level metrics.
# aggregate() hands spreadBias() the row indices of `tt` belonging to one stock;
# the values it returns are labelled with the 16 metric names assigned below.
# seq_len() is used rather than 1:nrow(tt) so that an empty `tt` does not
# produce the bogus index vector c(1, 0).
liqStock <- aggregate(seq_len(nrow(tt)), by = list(tt$Stock), spreadBias)
# aggregate() returns the group key plus one matrix-valued column; bind them
# into an ordinary data frame with one column per metric.
liqStock <- cbind(as.data.frame(liqStock[, 1]), liqStock[, 2])
dimnames(liqStock)[[2]] <- c("Stock", "midSpr", "wmdSpr", "micSpr",
                             "wmd_nomBias", "wmd_bias", "wmd_biasL", "wmd_biasU",
                             "mic_nomBias", "mic_bias", "mic_biasL", "mic_biasU",
                             "qSpr", "medianQSpr", "averagePrice", "volTrades", "volDollar")

# Persist the stock-level table for later use.
save(list = c("liqStock"), file = "temp/liqStock")

# Panel A: Summary stats.
# Spread, bias and price rows are weighted by dollar volume; the two volume
# rows are passed unit weights.
Table1 <- rbind(distributionStats(liqStock$midSpr,       w = liqStock$volDollar),
                distributionStats(liqStock$wmdSpr,       w = liqStock$volDollar),
                distributionStats(liqStock$micSpr,       w = liqStock$volDollar),
                distributionStats(liqStock$wmd_nomBias,  w = liqStock$volDollar),
                distributionStats(liqStock$mic_nomBias,  w = liqStock$volDollar),
                distributionStats(liqStock$qSpr,         w = liqStock$volDollar),
                distributionStats(liqStock$averagePrice, w = liqStock$volDollar),
                distributionStats(liqStock$volTrades,    w = rep(1, nrow(liqStock))),
                distributionStats(liqStock$volDollar,    w = rep(1, nrow(liqStock))))
dimnames(Table1) <- list(c("midSpr", "wmdSpr", "micSpr", "wmdnomBias", "micnomBias",
                           "qSpr", "price", "trades", "dollarvolume"),
                         c("vwMean", "sd", "5%", "25%", "median", "75%", "95%"))

# Table 1(a): the bias rows are left out here; they feed Table 1(b) below.
capture.output(round(Table1[c("midSpr", "wmdSpr", "micSpr", "qSpr", "price", "trades", "dollarvolume"), ], 2),
               file = "output/Table1a.txt")

# Table 1(b)
# Panel B: Bias and statistical tests.
# Trade-level nominal bias = midpoint effective spread minus the alternative
# measure. The columns are added to `tt` only for the regressions and are
# dropped again right after.
tt$wmdBias <- tt$espr_mid - tt$espr_wmd
tt$micBias <- tt$espr_mid - tt$espr_mic
# Intercept-only regressions weighted by dollar volume, with standard errors
# clustered by Stock, Date and Venue (fourth part of the felm formula).
micBiasTest <- felm(micBias ~ 1 | 0 | 0 | Stock + Date + Venue, data = tt, weights = tt$USDvol)
wmdBiasTest <- felm(wmdBias ~ 1 | 0 | 0 | Stock + Date + Venue, data = tt, weights = tt$USDvol)
tt[, c("wmdBias", "micBias") := NULL]

# One row per measure: (estimated mean bias, clustered t-statistic).
# `$coefficients` is spelled out in full instead of relying on partial matching
# of `$coeff`.
clusteredTstat <- rbind(c(wmdBiasTest$coefficients[1], wmdBiasTest$STATS$wmdBias$ctval),
                        c(micBiasTest$coefficients[1], micBiasTest$STATS$micBias$ctval))
# Note: summary(micBiasTest) yields a warning message, saying that clustered F-test can't be implemented.
#       That is not a problem, the clustered t-test is still valid.

# Relative bias = volume-weighted mean nominal bias / volume-weighted mean spread.
relativeBias <- Table1[c("wmdnomBias", "micnomBias"), "vwMean"] /
  Table1[c("wmdSpr", "micSpr"), "vwMean"]

# Rows = measures, columns = (nominal bias, t-stat, relative bias).
# The table is rounded only once, when written out; rounding to 4 digits first
# and to 2 digits afterwards can shift the last printed digit.
T1b <- cbind(clusteredTstat, relativeBias)
dimnames(T1b) <- list(c("Weighted midpoint", "Micro-price"),
                      c("Nominal bias", "t-stat", "Relative bias"))

capture.output(round(T1b, 2), file = "output/Table1b.txt")

# Numbers reported in the text of Section 3.3.
# These are bare expressions: they are auto-printed when the script is run
# interactively or through Rscript, but not under a plain source() call.
# NOTE(review): a trade priced at exactly 195 is not counted by `> 195` here,
# whereas the Figure 2 sample further down keeps only `Price < 195` -- confirm
# the intended boundary.
mean(tt$Price > 195)
mean(tt$Price < 5)
mean(tt$Price > 195 | tt$Price < 5)

# Trades priced in (25, 125]; the mask is computed once and reused.
inMidRange <- tt$Price > 25 & tt$Price <= 125
mean(inMidRange)
sum(tt$USDvol[inMidRange]) / sum(tt$USDvol)
weighted.mean(tt$espr_mid[inMidRange], w = tt$USDvol[inMidRange])
weighted.mean(tt$espr_wmd[inMidRange], w = tt$USDvol[inMidRange])
weighted.mean(tt$espr_mic[inMidRange], w = tt$USDvol[inMidRange])

mean(tt$Price <= 115)
sum(tt$USDvol[tt$Price <= 115]) / sum(tt$USDvol)

# Clean-up (liqStock is kept: the Figure 2(c) code below reads it).
rm(T1b, clusteredTstat, relativeBias, inMidRange, micBiasTest, wmdBiasTest, Table1)

######################################
# FIG 2: BIAS BY STOCK / PRICE GROUP #
######################################

# Keep trades priced in [5, 195) and put each into a 10-USD-wide price group
# labelled by its midpoint: [5, 15) -> 10, [15, 25) -> 20, ..., [185, 195) -> 190.
subsetStocks <- tt$Price >= 5 & tt$Price < 195
# Row indices of the selected trades. The same index vector is used both as the
# data passed to spreadBias() and to look up the prices, so the two always have
# equal length (which() drops NA prices, logical indexing would keep them).
tradeRows <- which(subsetStocks)
tradePriceGroup <- seq(10, 200, 10)[findInterval(tt$Price[tradeRows], seq(5, 205, 10))]
liqPricegroupBias <- aggregate(tradeRows, by = list(tradePriceGroup), spreadBias)
# Flatten the group key and the matrix-valued metric column into one data frame.
liqPricegroupBias <- as.data.frame(cbind(liqPricegroupBias[, 1], liqPricegroupBias[, 2]))
colnames(liqPricegroupBias) <- c("priceGroup", "midSpr", "wmdSpr", "micSpr",
                                 "wmd_nomBias", "wmd_bias", "wmd_biasL", "wmd_biasU",
                                 "mic_nomBias", "mic_bias", "mic_biasL", "mic_biasU",
                                 "qSpr", "medianQSpr", "averagePrice", "volTrades", "volDollar")

# Figure 2(a): the three effective-spread measures across price groups.
pdf("output/Fig2a.pdf")
plot(liqPricegroupBias$priceGroup, liqPricegroupBias$midSpr, type = "l", col = "blue",
     xlab = "Share price groups (USD)", ylab = "Effective spread (bps)")
lines(liqPricegroupBias$priceGroup, liqPricegroupBias$wmdSpr, col = "black")
lines(liqPricegroupBias$priceGroup, liqPricegroupBias$micSpr, col = "red")
legend("top", c("Midpoint", "Weighted midpoint", "Micro-price"), fill = c("blue", "black", "red"))
dev.off()

# Companion chart: dollar volume per price group.
pdf("output/Fig2a_volumes.pdf")
barplot(liqPricegroupBias$volDollar, main = "Trading volume across price groups",
        names.arg = liqPricegroupBias$priceGroup,
        xlab = "Share price groups (USD)", ylab = "Volume (billion USD)")
dev.off()

# Figure 2(b): + for statistical significance.
# Relative bias = nominal bias / spread of the respective measure. A group is
# marked with "+" when its `*_biasL` value is above zero.
wmdRelBias <- liqPricegroupBias$wmd_nomBias / liqPricegroupBias$wmdSpr
micRelBias <- liqPricegroupBias$mic_nomBias / liqPricegroupBias$micSpr
wmdSignificant <- liqPricegroupBias$wmd_biasL > 0
micSignificant <- liqPricegroupBias$mic_biasL > 0

pdf("output/Fig2b.pdf")
plot(liqPricegroupBias$priceGroup, wmdRelBias, type = "l", ylim = c(0, 0.2),
     xlab = "Share price groups (USD)", ylab = "Relative average bias")
lines(liqPricegroupBias$priceGroup, micRelBias, col = "red")
points(liqPricegroupBias$priceGroup[wmdSignificant], wmdRelBias[wmdSignificant], pch = 3)
points(liqPricegroupBias$priceGroup[micSignificant], micRelBias[micSignificant], pch = 3, col = "red")
legend("top", c("Weighted midpoint (+ for stat. sign.)", "Micro-price (+ for stat. sign.)"),
       fill = c("black", "red"))
dev.off()

# Figure 2(c): stock-level weighted-midpoint bias against average price,
# coloured by the stock's median quoted spread (uses `liqStock` built for Table 1).
# The median spread is a computed floating-point value, so it is matched to the
# 1-cent and 2-cent levels with a small tolerance rather than with `==`, which
# would silently drop stocks whose median is e.g. 0.00999999999999979.
centTolerance <- 1e-8
belowPriceCap <- liqStock$averagePrice < 195
oneCentStocks  <- belowPriceCap & abs(liqStock$medianQSpr - 0.01) < centTolerance
twoCentStocks  <- belowPriceCap & abs(liqStock$medianQSpr - 0.02) < centTolerance
widerStocks    <- belowPriceCap & liqStock$medianQSpr > 0.02 + centTolerance

pdf("output/Fig2c.pdf")
# Axis limits are taken over all stocks, not only the plotted subset.
plot(liqStock$averagePrice[oneCentStocks], liqStock$wmd_bias[oneCentStocks], pch = 15, col = "black",
     ylim = c(min(liqStock$wmd_bias), max(liqStock$wmd_bias)), xlim = c(0, max(liqStock$averagePrice)),
     xlab = "Share price groups (USD)", ylab = "Relative average bias")
points(liqStock$averagePrice[twoCentStocks], liqStock$wmd_bias[twoCentStocks], pch = 15, col = "blue")
points(liqStock$averagePrice[widerStocks], liqStock$wmd_bias[widerStocks], pch = 15, col = "red")
legend("top", c("1 cent", "2 cents", ">2 cents"), fill = c("black", "blue", "red"),
       title = "Median quoted spread:")
dev.off()

# Clean-up
rm(liqPricegroupBias, subsetStocks, tradeRows, tradePriceGroup,
   wmdRelBias, micRelBias, wmdSignificant, micSignificant,
   centTolerance, belowPriceCap, oneCentStocks, twoCentStocks, widerStocks)

########################
# FIG 3: BIAS BY VENUE #
########################

# Venue-level metrics: spreadBias() is applied to the row indices of `tt` that
# belong to each venue. seq_len() replaces 1:nrow(tt), which would yield
# c(1, 0) for an empty table.
liqVenue <- aggregate(seq_len(nrow(tt)), by = list(tt$Venue), spreadBias)
# Flatten the group key and the matrix-valued metric column into one data frame.
liqVenue <- cbind(as.data.frame(liqVenue[, 1]), liqVenue[, 2])
colnames(liqVenue) <- c("Venue", "midSpr", "wmdSpr", "micSpr",
                        "wmd_nomBias", "wmd_bias", "wmd_biasL", "wmd_biasU",
                        "mic_nomBias", "mic_bias", "mic_biasL", "mic_biasU",
                        "qSpr", "medianQSpr", "averagePrice", "volTrades", "volDollar")

# Keep only the two bias columns, rounded to 3 decimals, with the venue
# identifier as row name so that it becomes the bar-group label.
Fig3 <- round(liqVenue[, c("wmd_bias", "mic_bias")], 3)
rownames(Fig3) <- liqVenue[, 1]

# Grouped bar chart: one pair of bars (weighted midpoint, micro-price) per venue.
pdf("output/Fig3.pdf")
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
barplot(t(Fig3), beside = TRUE, main = "Bias across venues", xlab = "Venues",
        col = c("black", "red"), ylab = "Relative average bias")
legend("topleft", c("Weighted midpoint", "Micro-price"), fill = c("black", "red"))
dev.off()

# Clean-up
rm(liqVenue, Fig3)

# I do not provide scripts for the 20-year sample in Section 3.5, as that amounts to no more
# than comparing the midpoint effective spread to the weighted midpoint effective spread.
# See script "1_build_trade_data.R" for details on those measures.
