# This function filters trade and quote observations, matches trades to quotes,
# and obtains fundamental value estimators (midpoint, weighted midpoint, and micro-price) for each quote
tradequoteFilter_function = function(ticker, silent=TRUE, microprice.return=F, outfolder="", load.microprice = F, microprice.folder){
  # Quotes
  qq = tq[tq$Type=="Quote" & tq$Stock==ticker,c("Date","Quote.Time","Time","gmtOffset","BuyerID","Bid.Price","Bid.Size","SellerID","Ask.Price","Ask.Size","Qualifiers"), with=FALSE];
  
  # Removing quotes that are not the last in a given millisecond
  qq = qq[!duplicated(qq,by=c("Date","Quote.Time"),fromLast = TRUE)];
  
  # Converting time stamps to seconds past midnight
  time = strptime(qq$Quote.Time,format="%H:%M:%OS"); # Before 2006-10-23, for NYSE stocks, use qq$Time instead of qq$Quote.Time
  qq$Time = (time$hour + qq$gmtOffset)*3600 + time$min*60 + time$sec;
  qq[,c("Quote.Time","gmtOffset"):=NULL];
  
  # Fundamental value estimators 
  qq$mid = (qq$Bid.Price + qq$Ask.Price)/2; # Midpoint
  qq$spr = round(qq$Ask.Price - qq$Bid.Price,4); # Nominal spread. Important to not round to two digits for pre-decimalization sample
  qq$imb = qq$Bid.Size/(qq$Ask.Size + qq$Bid.Size); # Order book imbalance
  qq$wmd = qq$mid + qq$spr * (qq$imb - 0.5); # Weighted midpoint
  
  # Micro-price (following Stoikov, 2018)
  if(load.microprice) load(paste0(microprice.folder,ticker)) else {
    imbStates = 9; # Number of order book imbalance states
    freq = 1; # 1 implies 1 second sampling frequency. The paper uses 100 ms sampling frequency (freq=10).
    iter = 10; # Number of iterations for the microprice adjustment function
    microprice = microprice_function(quotes = qq[,c("Date","Time","mid","spr","imb","Bid.Size","Ask.Size","Qualifiers"),with=F], 
                                     imbStates = imbStates, iter = iter, ticker = ticker, outfolder = outfolder, freq = freq);
    
    # Saving micro-price file
    save(list=c("microprice"),file=paste0(outfolder,ticker));
  }
  if(microprice.return){
    if(!silent) print(paste0(qq$Date[1],"_",ticker));
    return(1);
  } 
  
  # Assigning micro-price
  qq$mic = NA;
  qq$S = NA;
  qq$I = NA;
  if(is.character(microprice)){
    print(paste0(ticker," ",qq$Date[1],": Singular matrix error"));
  } else {
    qq$S[qq$spr > microprice$spr.breaks[1] & qq$spr < tail(microprice$spr.breaks,1) & !is.na(qq$spr)] = findInterval(qq$spr[qq$spr>microprice$spr.breaks[1] & qq$spr<tail(microprice$spr.breaks,1) & !is.na(qq$spr)], microprice$spr.breaks);
    for(s in 1:(length(microprice$spr.breaks)-1)){
      spreadSubset = qq$S == s & !is.na(qq$spr) & !is.na(qq$S);
      qq$I[spreadSubset] = findInterval(qq$imb[spreadSubset],microprice$imb.breaks[[s]], rightmost.closed = T);
    }
    qq$state = match(paste(qq$S,qq$I,sep="_"),paste(microprice$G$S,microprice$G$I,sep="_"));
    qq$mic[!is.na(qq$state)] = qq$mid[!is.na(qq$state)] + microprice$G$G[qq$state[!is.na(qq$state)]];
  }
  
  # Trades
  tt = tq[tq$Type=="Trade" & tq$Stock==ticker,c("Date","Venue","Price","Volume","Qualifiers","Seq.No.","Exch.Time","tickDir","Time","gmtOffset"),with=FALSE]; 
  
  # Converting trade time stamps to seconds past midnight
  time = strptime(tt$Exch.Time,format="%H:%M:%OS");  # Before 2006-10-23, for NYSE stocks, use qq$Time instead of qq$Exch.Time
  tt$Time = (time$hour + tt$gmtOffset)*3600 + time$min*60 + time$sec;
  tt = tt[,c("Exch.Time","gmtOffset") := NULL];
  
  # MATCHING TRADES TO QUOTES 
  # One millisecond backwards and 10 seconds and 5 minutes forward (for price impact and realized spread calculations)
  tt = data.table(tt,key = c("Date","Time"))
  qq$Time = qq$Time+0.001; # Adding one millisecond on quote time stamps in order to match trades to the last quote of the previous millisecond
  setkey(qq,Date,Time);
  ix = qq[tt,roll=TRUE,which=TRUE,mult="last"]; 
  tt$mid = qq$mid[ix];
  tt$wmd = qq$wmd[ix];
  tt$mic = qq$mic[ix];
  tt$spr = qq$spr[ix];
  tt$qQual = qq$Qualifiers[ix];
  tt$Bid = qq$Bid.Price[ix];
  tt$Ask = qq$Ask.Price[ix];
  tt$BuyerID  = qq$BuyerID[ix];
  tt$SellerID = qq$SellerID[ix];
  tt$BidSize = qq$Bid.Size[ix];
  tt$AskSize = qq$Ask.Size[ix];
  
  qq$Time = qq$Time-10; # Subtracting ten seconds on quote time stamps in order to match trades to the quotes 10 seconds after the trade
  setkey(qq,Date,Time);
  ix = qq[tt,roll=TRUE,which=TRUE,mult="last"]; 
  tt$mid10s = qq$mid[ix];
  tt$wmd10s = qq$wmd[ix];
  tt$mic10s = qq$mic[ix];
  tt$qQual10s = qq$Qualifiers[ix];
  
  qq$Time = qq$Time-290; # Subtracting another 290 seconds on quote time stamps in order to match trades to the quotes 5 minutes after the trade
  setkey(qq,Date,Time);
  ix = qq[tt,roll=TRUE,which=TRUE,mult="last"]; 
  tt$mid5min = qq$mid[ix];
  tt$wmd5min = qq$wmd[ix];
  tt$mic5min = qq$mic[ix];
  tt$qQual5min = qq$Qualifiers[ix];
  
  # TRADE FILTERING
  # (a) Before any other filtering is done: 
  # - exclude first and last five minutes of trading
  # - exclude dark trades
  # - exclude block trades
  tt = tt[tt$Time > (9.5*3600 + 5*60) & tt$Time < (16*3600 - 5*60) & tt$Venue!="ADF" & tt$Volume < 10000];
  
  # (b) Filtering with respect to qualifiers
  setnames(tt,"Qualifiers","tQual")
  t_subset1 = !grepl("[GV",tt$tQual,fixed=T) | !grepl("[LST",tt$tQual,fixed=T) | !(grepl("@   ",tt$tQual,fixed=T) | grepl("@F  ",tt$tQual,fixed=T) | grepl("@F I",tt$tQual,fixed=T) | grepl("   I",tt$tQual,fixed=T) | grepl(" F  ",tt$tQual,fixed=T) | grepl(" F I",tt$tQual,fixed=T) | grepl("@  I",tt$tQual,fixed=T));
  t_subset2 = !grepl("DPT[CTS_QUAL]",tt$tQual,fixed=T) & !grepl("SOT[CTS_QUAL]",tt$tQual,fixed=T) & !grepl("XSW[CTS_QUAL]",tt$tQual,fixed=T) & !grepl("RCK[CTS_QUAL]",tt$tQual,fixed=T) & !grepl("SLD[CTS_QUAL]",tt$tQual,fixed=T) & !grepl("XTR[CTS_QUAL]",tt$tQual,fixed=T) & !grepl("XO [CTS_QUAL]",tt$tQual,fixed=T);
  t_subset3 = !grepl("AGX[PRC_QL2]",tt$tQual,fixed=T) & !grepl("STP[PRC_QL2]",tt$tQual,fixed=T)  & !grepl("NBL[PRC_QL2]",tt$tQual,fixed=T) & !grepl("SPC[PRC_QL2]",tt$tQual,fixed=T)  & !grepl("B/W[PRC_QL2]",tt$tQual,fixed=T);

  # (c) Handling trade corrections: 
  # Consider correction messages with a positive Seq.No.
  # Match it to the trade with same date and same Seq.No.
  # Remove such trades
  tt$corr = F;
  corr = tq[tq$Type=="Correction" & tq$Seq.No.>0 & tq$Stock==ticker,];
  if(nrow(corr)>0) for(cc in 1:nrow(corr)){
    ix = which(tt$Seq.No.==corr$Seq.No.[cc] & tt$date==corr$Date.L.[cc] & tt$Ex.Cntrb.ID==corr$Ex.Cntrb.ID[cc])
    tt$corr[ix] = T;
  }
  t_subset4 = !tt$corr;
  
  # (d) Missing and negative prices and volumes
  t_subset5 = !is.na(tt$Price) & !is.na(tt$Volume) & tt$Price>0 & tt$Volume>0;
  
  # (e) Merging all trade filters
  t_subset = t_subset1 & t_subset2 & t_subset3 & t_subset4 & t_subset5;
  
  # QUOTE FILTERS
  # (a) Filtering with respect to qualifiers
  q_subset1 = !grepl("[PRC_QL_CD]",tt$qQual,fixed=T) | grepl("R  [PRC_QL_CD]",tt$qQual,fixed=T) | grepl("R  [PRC_QL_CD]",tt$qQual,fixed=T) | grepl("   [PRC_QL_CD]",tt$qQual,fixed=T);
  q_subset2 = !grepl("[PRC_QL3]",tt$qQual,fixed=T)   | grepl("R  [PRC_QL3]",tt$qQual,fixed=T)   | grepl("R  [PRC_QL3]",tt$qQual,fixed=T)   | grepl("   [PRC_QL3]",tt$qQual,fixed=T) | grepl("LPB[PRC_QL3]",tt$qQual,fixed=T) | grepl("RPB[PRC_QL3]",tt$qQual,fixed=T);
  q_subset3 = !(grepl("A[GV1_FLAG]",tt$qQual,fixed=T)    | grepl("B[GV1_FLAG]",tt$qQual,fixed=T)    | grepl("C[GV1_FLAG]",tt$qQual,fixed=T));
  q_subset4 = !(grepl("TH [CTS_QUAL]",tt$qQual,fixed=T)  | grepl("IND[CTS_QUAL]",tt$qQual,fixed=T)  | grepl("O  [CTS_QUAL]",tt$qQual,fixed=T));

  # (b) Filtering out crossed, locked and very large spreads, as well as missing values in mid, mic, and spr
  q_subset5 = tt$spr>0 & tt$spr <= 5 & !is.na(tt$mid) & !is.na(tt$mic) & !is.na(tt$spr);

  # (c) Merging all trade filters
  q_subset = q_subset1 & q_subset2 & q_subset3 & q_subset4 & q_subset5;
  q_notlocked = round(tt$spr,2)!=0;
  q_locked_sameVenue = sum(round(tt$spr,2)==0 & tt$BuyerID == tt$SellerID)/sum(round(tt$spr,2)!=0);
  
  q_subset_10s = (!grepl("[PRC_QL_CD]",tt$qQual10s,fixed=T) | grepl("R  [PRC_QL_CD]",tt$qQual10s,fixed=T) | grepl("   [PRC_QL_CD]",tt$qQual10s,fixed=T)) &
    (!grepl("[PRC_QL3]",tt$qQual10s,fixed=T)       | grepl("R  [PRC_QL3]",tt$qQual10s,fixed=T)   | grepl("   [PRC_QL3]",tt$qQual10s,fixed=T) | grepl("LPB[PRC_QL3]",tt$qQual10s,fixed=T) | grepl("RPB[PRC_QL3]",tt$qQual10s,fixed=T)) &
    (!(grepl("A[GV1_FLAG]",tt$qQual10s,fixed=T)    | grepl("B[GV1_FLAG]",tt$qQual10s,fixed=T)    | grepl("C[GV1_FLAG]",tt$qQual10s,fixed=T))) &
    (!(grepl("TH [CTS_QUAL]",tt$qQual10s,fixed=T)  | grepl("IND[CTS_QUAL]",tt$qQual10s,fixed=T)  | grepl("O  [CTS_QUAL]",tt$qQual10s,fixed=T))) &
    tt$spr10s>0 & tt$spr10s <= 5 & !is.na(tt$mid10s) & !is.na(tt$mic10s) & !is.na(tt$spr10s);
  
  q_subset_5min = (!grepl("[PRC_QL_CD]",tt$qQual5min,fixed=T) | grepl("R  [PRC_QL_CD]",tt$qQual5min,fixed=T) | grepl("   [PRC_QL_CD]",tt$qQual5min,fixed=T)) &
    (!grepl("[PRC_QL3]",tt$qQual5min,fixed=T)       | grepl("R  [PRC_QL3]",tt$qQual5min,fixed=T)   | grepl("   [PRC_QL3]",tt$qQual5min,fixed=T) | grepl("LPB[PRC_QL3]",tt$qQual5min,fixed=T) | grepl("RPB[PRC_QL3]",tt$qQual5min,fixed=T)) &
    (!(grepl("A[GV1_FLAG]",tt$qQual5min,fixed=T)    | grepl("B[GV1_FLAG]",tt$qQual5min,fixed=T)    | grepl("C[GV1_FLAG]",tt$qQual5min,fixed=T))) &
    (!(grepl("TH [CTS_QUAL]",tt$qQual5min,fixed=T)  | grepl("IND[CTS_QUAL]",tt$qQual5min,fixed=T)  | grepl("O  [CTS_QUAL]",tt$qQual5min,fixed=T))) &
    tt$spr5min>0 & tt$spr5min <= 5 & !is.na(tt$mid5min) & !is.na(tt$mic5min) & !is.na(tt$spr5min);
  
  # Counting filtering --> used for numbers reported in Table A.1
  filterStats = c(length(t_subset),1-c(sum(t_subset1),sum(t_subset2),sum(t_subset3),sum(t_subset4),sum(t_subset5),sum(t_subset))/length(t_subset),
                  1-c(sum(t_subset & q_subset2),sum(t_subset & q_subset3),sum(t_subset & q_subset4),sum(t_subset & q_subset5),sum(t_subset & q_notlocked),sum(t_subset & q_subset),
                  sum(t_subset & q_subset_10s),sum(t_subset & q_subset_5min))/sum(t_subset),q_locked_sameVenue);
  
  # Filtering
  tt$mid[!q_subset] = NA;
  tt$wmd[!q_subset] = NA;
  tt$mic[!q_subset] = NA;
  tt$spr[!q_subset] = NA;

  tt$mid10s[!q_subset_10s] = NA;
  tt$wmd10s[!q_subset_10s] = NA;
  tt$mic10s[!q_subset_10s] = NA;

  tt$mid5min[!q_subset_5min] = NA;
  tt$wmd5min[!q_subset_5min] = NA;
  tt$mic5min[!q_subset_5min] = NA;
  
  # MERGING VENUES
  tt$Venue[tt$Venue == "ASE"] = "NYS"; # NYS and ASE are treated as the same venue
  tt$Venue[tt$Venue == "THM"] = "NAS"; # NAS and THM are treated as the same venue
  
  tt$Stock = ticker;
  
  # RETURNING TRADES
  if(!silent) print(ticker);
  
  return(tt[t_subset,c("Stock","Date","Time","Price","Volume","tickDir","Venue",
                              "mid","wmd","mic","spr",
                              "mid10s","wmd10s","mic10s",
                              "mid5min","wmd5min","mic5min"),with=F]);
  
}

# This function returns quote observations sampled in each second.
# The output includes the fundamental value estimators mid, wmd, and mic
quoteFilter_function = function(ticker, silent=TRUE, microprice.return=F, outfolder="", load.microprice = F, microprice.folder){
  # Quotes
  qq = tq[tq$Type=="Quote" & tq$Stock==ticker,c("Date","Quote.Time","Time","gmtOffset","BuyerID","Bid.Price","Bid.Size","SellerID","Ask.Price","Ask.Size","Qualifiers"), with=FALSE];
  
  # Removing quotes that are not the last in a given millisecond
  qq = qq[!duplicated(qq,by=c("Date","Quote.Time"),fromLast = TRUE)];
  
  # Converting time stamps to seconds past midnight
  time = strptime(qq$Quote.Time,format="%H:%M:%OS"); # Before 2006-10-23, for NYSE stocks, use qq$Time instead of qq$Quote.Time
  qq$Time = (time$hour + qq$gmtOffset)*3600 + time$min*60 + time$sec;
  qq[,c("Quote.Time","gmtOffset"):=NULL];
  
  # Fundamental value estimators 
  qq$mid = (qq$Bid.Price + qq$Ask.Price)/2; # Midpoint
  qq$spr = round(qq$Ask.Price - qq$Bid.Price,4); # Nominal spread. Important to not round to two digits for pre-decimalization sample
  qq$imb = qq$Bid.Size/(qq$Ask.Size + qq$Bid.Size); # Order book imbalance
  qq$wmd = qq$mid + qq$spr * (qq$imb - 0.5); # Weighted midpoint
  
  # Micro-price (following Stoikov, 2018)
  if(load.microprice) load(paste0(microprice.folder,ticker)) else {
    imbStates = 9; # Number of order book imbalance states
    freq = 1; # 1 implies 1 second sampling frequency. The paper uses 100 ms sampling frequency (freq=10).
    iter = 10; # Number of iterations for the microprice adjustment function
    microprice = microprice_function(quotes=qq[,c("Date","Time","mid","spr","imb","Bid.Size","Ask.Size","Qualifiers"),with=F],imbStates=imbStates,iter=10,ticker=ticker,outfolder=outfolder, freq=freq);
    
    # Saving micro-price file
    save(list=c("microprice"),file=paste0(outfolder,ticker));
  }
  if(microprice.return){
    if(!silent) print(paste0(qq$Date[1],"_",ticker));
    return(1);
  } 
  
  # Retaining the first quote of each second
  qq = qq[c(TRUE,trunc(qq$Time[-1],0)>trunc(qq$Time[-nrow(qq)],0))];
  
  # Assigning micro-price
  qq$mic = NA;
  qq$S = NA;
  qq$I = NA;
  if(is.character(microprice)){
    print(paste0(ticker," ",qq$Date[1],": Singular matrix error"));
  } else {
    qq$S[qq$spr > microprice$spr.breaks[1] & qq$spr < tail(microprice$spr.breaks,1) & !is.na(qq$spr)] = findInterval(qq$spr[qq$spr>microprice$spr.breaks[1] & qq$spr<tail(microprice$spr.breaks,1) & !is.na(qq$spr)], microprice$spr.breaks);
    for(s in 1:(length(microprice$spr.breaks)-1)){
      spreadSubset = qq$S == s & !is.na(qq$spr) & !is.na(qq$S);
      qq$I[spreadSubset] = findInterval(qq$imb[spreadSubset],microprice$imb.breaks[[s]], rightmost.closed = T);
    }
    qq$state = match(paste(qq$S,qq$I,sep="_"),paste(microprice$G$S,microprice$G$I,sep="_"));
    qq$mic[!is.na(qq$state)] = qq$mid[!is.na(qq$state)] + microprice$G$G[qq$state[!is.na(qq$state)]];
  }

  # QUOTE FILTERS
  # (a) Filtering with respect to qualifiers
  q_subset1 = !grepl("[PRC_QL_CD]",qq$Qualifiers,fixed=T) | grepl("R  [PRC_QL_CD]",qq$Qualifiers,fixed=T) | grepl("R  [PRC_QL_CD]",qq$Qualifiers,fixed=T) | grepl("   [PRC_QL_CD]",qq$Qualifiers,fixed=T);
  q_subset2 = !grepl("[PRC_QL3]",qq$Qualifiers,fixed=T)   | grepl("R  [PRC_QL3]",qq$Qualifiers,fixed=T)   | grepl("R  [PRC_QL3]",qq$Qualifiers,fixed=T)   | grepl("   [PRC_QL3]",qq$Qualifiers,fixed=T) | grepl("LPB[PRC_QL3]",qq$Qualifiers,fixed=T) | grepl("RPB[PRC_QL3]",qq$Qualifiers,fixed=T);
  q_subset3 = !(grepl("A[GV1_FLAG]",qq$Qualifiers,fixed=T)    | grepl("B[GV1_FLAG]",qq$Qualifiers,fixed=T)    | grepl("C[GV1_FLAG]",qq$Qualifiers,fixed=T));
  q_subset4 = !(grepl("TH [CTS_QUAL]",qq$Qualifiers,fixed=T)  | grepl("IND[CTS_QUAL]",qq$Qualifiers,fixed=T)  | grepl("O  [CTS_QUAL]",qq$Qualifiers,fixed=T));
  
  # (b) Filtering out crossed, locked and very large spreads, as well as missing values in mid, mic, and spr
  q_subset5 = qq$spr>0 & qq$spr <= 5 & !is.na(qq$mid) & !is.na(qq$mic) & !is.na(qq$spr);
  
  # (c) Merging all quote filters
  q_subset = q_subset1 & q_subset2 & q_subset3 & q_subset4 & q_subset5;

  # RETURNING QUOTES
  if(!silent) print(ticker);
  qq$Stock = ticker;
  
  return(qq[q_subset,c("Stock","Date","Time","Bid.Price","Ask.Price","Bid.Size","Ask.Size",
                       "mid","wmd","mic"),with=F]);
}

# The micro-price is due to Stoikov (2018). This implementation deviates somewhat from his, see Appendix A
microprice_function = function(quotes, imbStates=9, iter=10, ticker="DUMMY", outfolder="", freq=1){
  # quotes is the qq data, containing the variables Date, Time, Bid.Price, Bid.Size, Ask.Price, Ask.Size
  # imbStates is the number of order book imbalance states. It should be an odd number exceeding 1.
  # iter is the number of iterations used for the microprice adjustment function
  # ticker is used for file names of output
  # outfolder is where the output is saved
  # freq is the quote sampling frequency, expressed as number of observations per second
  
  # EQUISPACED QUOTES (excluding quotes outside the time range 09:05 - 15:55)
  periods = data.table(Date=rep(unique(quotes$Date),each=((16-9.5)*3600-601)*freq+1), 
                       Time=(((9.5*3600+300)*freq):((16*3600-301)*freq))/freq);
  setkeyv(periods,c("Date","Time"));
  setkeyv(quotes,c("Date","Time"));
  quotes = quotes[periods,roll=T]
  
  # QUOTE FILTERS
  q_subset1 = !grepl("[PRC_QL_CD]",quotes$Qualifiers,fixed=T) | grepl("R  [PRC_QL_CD]",quotes$Qualifiers,fixed=T) | grepl("R  [PRC_QL_CD]",quotes$Qualifiers,fixed=T) | grepl("   [PRC_QL_CD]",quotes$Qualifiers,fixed=T) | grepl("LM [PRC_QL_CD]",quotes$Qualifiers,fixed=T);
  q_subset2 = !grepl("[PRC_QL3]",quotes$Qualifiers,fixed=T)   | grepl("R  [PRC_QL3]",quotes$Qualifiers,fixed=T)   | grepl("R  [PRC_QL3]",quotes$Qualifiers,fixed=T)   | grepl("   [PRC_QL3]",quotes$Qualifiers,fixed=T) | grepl("LPB[PRC_QL3]",quotes$Qualifiers,fixed=T) | grepl("RPB[PRC_QL3]",quotes$Qualifiers,fixed=T);
  q_subset3 = !(grepl("A[GV1_FLAG]",quotes$Qualifiers,fixed=T)    | grepl("B[GV1_FLAG]",quotes$Qualifiers,fixed=T)    | grepl("C[GV1_FLAG]",quotes$Qualifiers,fixed=T));
  q_subset4 = !(grepl("TH [CTS_QUAL]",quotes$Qualifiers,fixed=T)  | grepl("IND[CTS_QUAL]",quotes$Qualifiers,fixed=T)  | grepl("O  [CTS_QUAL]",quotes$Qualifiers,fixed=T));
  q_subset5 = quotes$spr>0 & quotes$spr <= 5 & rowSums(is.na(quotes[,c("mid","spr","imb","Ask.Size","Bid.Size"),with=F]))==0 & quotes$Bid.Size > 0 & quotes$Ask.Size > 0;
  quotes    = quotes[q_subset1 & q_subset2 & q_subset3 & q_subset4 & q_subset5];
  
  # DISCRETIZING SPREADS
  # commonSpreads and rareSpreads are vectors of spread levels that are considered for spread states
  # spr.breaks is a vector used to allocate spread levels to the spread states
  # spr.names is a vector of names for each spread state
  
  # Determining "common spreads" (>1% of the quote observations)
  commonSpreads = as.numeric(names(which((table(quotes$spr)/nrow(quotes))>.01)));
  if(length(commonSpreads)==0){ # When no spread level has more than 1%, use deciles
    commonSpreads = quantile(quotes$spr,seq(0,1,.1));
    spr.breaks = commonSpreads;
    spr.names = paste0(100*spr.breaks[-11],"-",100*spr.breaks[-1]-c(rep(1,9),0)," ticks");
    spreadDeciles = T;
  } else spreadDeciles = F;
  
  # Determining "rare spreads" (>0.01% of the quote observations)
  rareSpreads = as.numeric(names(which((table(quotes$spr[quotes$spr < commonSpreads[1] | quotes$spr > tail(commonSpreads,1)])/nrow(quotes))>.0001)));
  
  if(length(rareSpreads)>0){
    # If rare spreads that are smaller than the common spreads jointly exceed 1%, they get a category of its own
    # If not, they are merged with the lowest common spread category
    smallRareSpreads = sum(quotes$spr %in% rareSpreads[rareSpreads<commonSpreads[1]])/nrow(quotes) > 0.01;
    
    # If rare spreads that are larger than the common spreads jointly exceed 1%, they get a category of its own
    # If not, they are merged with the highest common spread category
    largeRareSpreads = sum(quotes$spr %in% rareSpreads[rareSpreads>tail(commonSpreads,1)])/nrow(quotes) > 0.01;
    
    if(smallRareSpreads & largeRareSpreads){
      spr.breaks = c(head(rareSpreads,1)-0.001, commonSpreads-0.001, head(rareSpreads[rareSpreads>tail(commonSpreads,1)],1)-0.001, max(rareSpreads)+0.001);
      spr.names  = c(paste0("<",100*commonSpreads[1]," ticks"), paste0(100*commonSpreads," ticks"), paste0(">",100*tail(commonSpreads,1)," ticks"));
    } 
    if(!smallRareSpreads & largeRareSpreads){
      spr.breaks = c(commonSpreads-0.001, head(rareSpreads[rareSpreads > tail(commonSpreads,1)],1)-0.001, max(rareSpreads)+0.001);
      if(sum(rareSpreads < commonSpreads[1])>0){
        spr.breaks[1] = rareSpreads[1]-0.001;
        spr.names     = c(paste0("\u2264",100*commonSpreads[1]," ticks"), paste0(100*commonSpreads[-1]," ticks"), paste0(">",100*tail(commonSpreads,1)," ticks"));
      } else {
        spr.names = c(paste0(100*commonSpreads," ticks"),paste0(">",100*tail(commonSpreads,1)," ticks"));
      }
    } 
    if(smallRareSpreads & !largeRareSpreads){
      spr.breaks = c(head(rareSpreads,1)-0.001,commonSpreads-0.001,max(rareSpreads)+0.001);
      if(sum(rareSpreads > tail(commonSpreads,1))>0){
        spr.names = c(paste0("<",100*commonSpreads[1]," ticks"), paste0(100*commonSpreads[-length(commonSpreads)]," ticks"), paste0("\u2265",100*tail(commonSpreads,1)," ticks"));
      } else {
        spr.names = c(paste0("<",100*commonSpreads[1]," ticks"), paste0(100*commonSpreads," ticks"))
      }
    } 
    if(!smallRareSpreads & !largeRareSpreads & !spreadDeciles){
      spr.breaks = c(commonSpreads-0.001,max(rareSpreads)+0.001);
      spr.names = c(paste0(100*commonSpreads," ticks"))
      if(sum(rareSpreads < commonSpreads[1])>0){
        spr.breaks[1] = rareSpreads[1]-0.001;
        spr.names[1] = paste0("\u2264",100*commonSpreads[1]," ticks"); 
      }
      if(sum(rareSpreads > tail(commonSpreads,1))>0) spr.names[length(spr.names)] = paste0("\u2265",100*tail(commonSpreads,1)," ticks");
    }   
  } else {
    spr.breaks = c(commonSpreads - 0.001, max(commonSpreads) + 0.001);
    spr.names  = c(paste0(100*commonSpreads," ticks"))
  }
  if(spr.names[1] == "1 ticks") spr.names[1] = "1 tick";
  
  # Assigning spread state to each quote obersvation
  quotes$S = findInterval(quotes$spr,spr.breaks);
  
  # Descriptive stats for each spread state
  spr.categories = sapply(1:max(quotes$S),function(x){
    c(sum(quotes$S==x),
      min(quotes$spr[quotes$S==x]),
      median(quotes$spr[quotes$S==x]),
      mean(quotes$spr[quotes$S==x]),
      max(quotes$spr[quotes$S==x]))
  })
  dimnames(spr.categories) = list(c("n","min","median","mean","max"),if(max(quotes$S)>length(spr.names)) c(spr.names,"excluded") else spr.names);
  
  # Deleting quote observations with spreads that are too rare to calculate the microprice
  quotes = quotes[quotes$S %in% 1:length(spr.names)];
  
  # DISCRETIZING THE ORDER BOOK IMBALANCE
  # The order book imbalance state breakpoints are allowed to vary across spread states
  # imb.breaks is a list of vectors used to allocate order imbalance levels to the order imbalance states
  imb.breaks = vector("list",length(unique(quotes$S)));
  nn = imbStates - 1;
  for(s in 1:length(unique(quotes$S))){
    spreadSubset = quotes$S==unique(quotes$S)[s];
    
    # The central order book imbalance state is predefined to cover the interval [0.45, 0.55]
    # lower.breaks determine the breakpoints for imbalances > 0.55
    # upper.breaks determine the breakpoints for imbalances < 0.45
    # In rare cases where [0.45, 0.55] is not populated, only two states are used: lower and upper 
    lower.breaks = if(sum(quotes$imb<.45 & spreadSubset)>0) quantile(quotes$imb[quotes$imb<.45 & spreadSubset],seq(2/nn,1-2/nn,2/nn)) else rep(0,nn/2-1);
    upper.breaks = if(sum(quotes$imb>.55 & spreadSubset)>0) quantile(quotes$imb[quotes$imb>.55 & spreadSubset],seq(2/nn,1-2/nn,2/nn)) else rep(1,nn/2-1);
    imb.breaks[[s]] = c(0,lower.breaks,.45,.55,upper.breaks,1);
    
    # Assigning order imbalance state to each quote obersvation
    quotes$I[spreadSubset] = findInterval(quotes$imb[spreadSubset],imb.breaks[[s]], rightmost.closed = T);
  }
  
  # MIDPOINT CHANGES
  quotes$dM = round(c(diff(quotes$mid),NA),3);
  
  # To avoid the influence of outliers, midpoint changes that exceed the median spread of the highest spread state are set equal to that median
  dm_max = round(spr.categories["median",ncol(spr.categories)],5); 
  quotes$dM[quotes$dM >  dm_max] =  dm_max;
  quotes$dM[quotes$dM < -dm_max] = -dm_max;
  
  # ESTIMATION
  
  # 1. Symmetrization
  # Appending a mirror image of data set 
  quotes2    = quotes;
  quotes2$I  = imbStates + 1 - quotes$I;
  quotes2$dM = -quotes$dM;
  
  # States in current and next period
  quotes$x = paste(quotes$S,quotes$I,sep="_");
  quotes$y = c(quotes$x[-1],NA);
  
  quotes2$x  = paste(quotes2$S,quotes2$I,sep="_");
  quotes2$y  = c(quotes2$x[-1],NA);
  
  # Excluding the last observation (because the next-period obs is NA)
  quotes  = quotes[-nrow(quotes)];
  quotes2 = quotes2[-nrow(quotes2)];
  
  # Merging the two data sets
  quotes = rbind(quotes,quotes2);
  rm(quotes2);
  
  # Making sure that all states exist in both x and y (such that Q is quadratic)
  quotes = quotes[quotes$x %in% unique(quotes$y) & quotes$y %in% unique(quotes$x)]; 
  
  # 2. Transition probability estimation
  # Transient states
  # SI captures all combinations of S and I states
  SI = names(table(quotes$x)); 
  mn = length(SI);
  
  # To obtain the observed probabilities of transitions between states, first count the frequency of each SI
  # This matrix has dimension mn x mn
  nSI = table(quotes$x,quotes$y);
  
  # Then count the same frequency conditional on that the midpoint DOES NOT change
  # This matrix may not have dimension mn x mn
  nSI_Mzero = table(quotes$x[quotes$dM == 0], quotes$y[quotes$dM == 0]);
  # Adding empty rows and columns for unpopulated combinations of x and y to obtain an mn x mn matrix
  nSI_Mzero = rbind(nSI_Mzero,matrix(0, ncol=ncol(nSI_Mzero), nrow=sum(!(SI %in% dimnames(nSI_Mzero)[[1]])), dimnames=list(SI[!(SI %in% dimnames(nSI_Mzero)[[1]])],NULL))); 
  nSI_Mzero = cbind(nSI_Mzero,matrix(0, nrow=nrow(nSI_Mzero), ncol=sum(!(SI %in% dimnames(nSI_Mzero)[[2]])), dimnames=list(NULL,SI[!(SI %in% dimnames(nSI_Mzero)[[2]])]))); 
  # Putting the states in the correct order
  nSI_Mzero = nSI_Mzero[SI,SI]; 
  # Obtaining probabilities for matrix Q
  Q   = nSI_Mzero / rowSums(nSI);
  
  # Absorbing states
  K = sort(unique(quotes$dM[quotes$dM != 0])); 
  # R is a mn x k matrix
  R = table(quotes$x, quotes$dM)[,colnames(table(quotes$x, quotes$dM)) != 0] / rowSums(nSI);
  # TT is the T matrix in the manuscript. It is a mn x mn matrix.
  TT  = (nSI - nSI_Mzero) / rowSums(nSI);
  
  rm(nSI_Mzero, nSI)
  
  # 3. Computing the midpoint adjustment
  # Check if 1-Q is invertible, and return a silent error otherwise
  invertible = tryCatch(solve(diag(mn)-Q), error=function(e) "No")
  if(is.character(invertible)) return("ERROR");
  
  # Computing G1 and B
  G1 = solve(diag(mn) - Q) %*% R %*% K;
  B  = solve(diag(mn) - Q) %*% TT;
  
  # Iterative approach to obtain the midpoint adjustment for each combination of S and I states
  Gx = matrix(ncol=iter,nrow=mn,dimnames=list(SI,1:iter));
  for(i in 1:iter){
    Gx[,i] = if(i==1) G1 else B %*% Gx[,i-1];
  }
  states = do.call(rbind,strsplit(SI,split="_"));
  Gstar = data.frame(S=states[,1], I=as.numeric(states[,2]), Gstar=rowSums(Gx));
  
  # Plotting the adjustment function (as in Figure IA.3 in the internet appendix)
  plot_g_star = ggplot(data = Gstar, aes(x=I, y=Gstar)) + geom_line(aes(colour=S)) + 
    scale_color_manual(values = colorRampPalette(brewer.pal(max(min(length(spr.names),8),3), "Set1"))(length(spr.names)), labels = spr.names) + 
    labs(x = "Order book imbalance state", y="G*") + ggtitle(ticker, sub=paste0("Average midprice ",median(quotes$mid)));
  ggsave(paste0(outfolder,ticker,"_microprice.png"), h=5,w=8, plot_g_star);
  
  return(list(Gx=Gx, G=Gstar, imb.breaks=imb.breaks, spr.breaks=spr.breaks, 
              transient.states=SI, absorptive.states=K, imbStates=imbStates, mn=mn, 
              spread.stats=spr.categories));
}

# WEIGHTED VARIANCE AND WEIGHTED t-TEST
# Copied from https://www.r-bloggers.com/weighted-t-test-in-r/
# weighted variance, inspired by a function from Gavin Simpson on R-Help
var.wt <- function(x, w, na.rm = FALSE) {
  if (na.rm) {
    w <- w[i <- !is.na(x)]
    x <- x[i]
  }
  sum.w <- sum(w)
  return((sum(w*x^2) * sum.w - sum(w*x)^2) / (sum.w^2 - sum(w^2)))
}

weighted.t.test <- function(x, w, mu=0, conf.level = 0.95, alternative="two.sided", na.rm=TRUE) {
  if(!missing(conf.level) & (length(conf.level) != 1 || !is.finite(conf.level) || conf.level < 0 || conf.level > 1))
    stop("'conf.level' must be a single number between 0 and 1")
  
  if (na.rm) {
    w <- w[i <- !is.na(x)]
    x <- x[i]
  }
  
  # to achieve consistent behavior in loops, return NA-structure in case of complete missings
  if (sum(is.na(x)) == length(x)) return(list(estimate=NA, se=NA, conf.int=NA, statistic=NA, df=NA, p.value=NA))
  
  # if only one value is present: this is the best estimate, no significance test provided
  if (sum(!is.na(x)) == 1) {
    warning("Warning weighted.t.test: only one value provided; this value is returned without test of significance!", call.=FALSE)
    return(list(estimate=x[which(!is.na(x))], se=NA, conf.int=NA, statistic=NA, df=NA, p.value=NA))
  }
  
  x.w <- weighted.mean(x,w, na.rm=na.rm)
  var.w <- var.wt(x,w, na.rm=na.rm)
  df <- length(x)-1
  t.value <- sqrt(length(x))*((x.w-mu)/sqrt(var.w))
  se <- sqrt(var.w)/sqrt(length(x))
  
  if (alternative == "less") {
    pval <- pt(t.value, df)
    cint <- c(-Inf, x.w + se*qt(conf.level, df) )
  }
  else if (alternative == "greater") {
    pval <- pt(t.value, df, lower.tail = FALSE)
    cint <- c(x.w - se * qt(conf.level, df), Inf)
  }
  else {
    pval <- 2 * pt(-abs(t.value), df)
    alpha <- 1 - conf.level
    cint <- x.w + se*qt(1 - alpha/2, df)*c(-1,1)
  }
  
  names(t.value) <- "t"
  return(list(estimate=x.w, se=se, conf.int=cint, statistic=t.value, df=df, p.value=pval))
}

# This function is used to generate summary stats for Table 1
distributionStats = function(x,...){
  c(weighted.mean(x,...,na.rm=T),sd(x,na.rm=T),quantile(x,c(0.05,0.25,0.50,0.75,0.95)))
}

# This function generates the average spreads and spread biases that are reported throughout the paper
spreadBias = function(rows){
  if(length(rows)<10) return(rep(NA,15));
  
  weights = tt$USDvol[rows];
  
  espr_mid_mean = weighted.mean(tt$espr_mid[rows], w=weights);
  espr_wmd_mean = weighted.mean(tt$espr_wmd[rows], w=weights);
  espr_mic_mean = weighted.mean(tt$espr_mic[rows], w=weights);
  
  wmd_bias = weighted.t.test((tt$espr_mid[rows] - tt$espr_wmd[rows]), w=weights, mu=0);
  mic_bias = weighted.t.test((tt$espr_mid[rows] - tt$espr_mic[rows]), w=weights, mu=0);
  
  c(espr_mid_mean,
    espr_wmd_mean,
    espr_mic_mean,
    wmd_bias$estimate,
    wmd_bias$estimate/espr_wmd_mean,
    wmd_bias$conf.int/espr_wmd_mean,
    mic_bias$estimate,
    mic_bias$estimate/espr_mic_mean,
    mic_bias$conf.int/espr_mic_mean,
    weighted.mean(tt$qspr[rows], w=weights),
    median(tt$spr[rows]),
    mean(tt$Price[rows]),
    length(rows),
    sum(weights));
}

# This function compares liquidity portfolio assignments obtained with the specified estimator to those of the midpoint
liqQuintileFunction = function(rows,estimator="micSpr"){
  liqQuintiles = cbind(findInterval(liqStockDate$midSpr[rows],quantile(liqStockDate$midSpr[rows],seq(0,.9,.2))),
                       findInterval(liqStockDate[rows,estimator],quantile(liqStockDate[rows,estimator],seq(0,.9,.2))))
  out = rep(0,9);
  tmp = table(liqQuintiles[,1]-liqQuintiles[,2])
  out[match(names(tmp),-4:4)] = tmp;
  out;
}

# This function is used for the venue rank analysis in Section 4.3
venuerank = function(stockdate){
  rows=which(liqStockVenueDate$Stock==stockdate[1] & liqStockVenueDate$Date==stockdate[2] & 
               liqStockVenueDate$Venue!="MID")
  out = cbind(as.data.frame(liqStockVenueDate$Venue[rows]),
              rank(liqStockVenueDate$midSpr[rows]),
              rank(liqStockVenueDate$wmdSpr[rows]),
              rank(liqStockVenueDate$micSpr[rows]));
  dimnames(out)[[2]] = c("venue","midRank","wmdRank","micRank");
  return(out);
}

# This function is used for comparing the weighted means reported in Table 5
weighted.means.2var = function(dt,var1,var2,weight.var = "none",subset = TRUE, cluster = NULL){
  if(is.data.table(dt)) dt = as.data.frame(dt);
  if(is.null(cluster)){
    if(weight.var == "none"){
      mean1 = mean(x=dt[subset,var1],na.rm=T);
      mean2 = mean(x=dt[subset,var2],na.rm=T);
      tTest = t.test(x=dt[subset,var1] - dt[subset,var2],na.rm=T);
      out = c(mean1,mean2,tTest$estimate[1],tTest$estimate[1]/mean2,tTest$stat,tTest$p.value);
    } else {
      mean1 = weighted.mean(x=dt[subset,var1],w=dt[subset,weight.var],na.rm=T);
      mean2 = weighted.mean(x=dt[subset,var2],w=dt[subset,weight.var],na.rm=T);
      tTest = wtd.t.test(x=dt[subset,var1] - dt[subset,var2], weight=dt[subset,weight.var]);
      out = c(mean1,mean2,tTest$additional[1],tTest$additional[1]/mean2,tTest$coeff[c(1,3)]);
    } 
  } else {
    dt$diff = dt[,var1]-dt[,var2];
    if(weight.var == "none"){
      mean1 = mean(x=dt[subset,var1],na.rm=T);
      mean2 = mean(x=dt[subset,var2],na.rm=T);
      tTest = felm(as.formula(paste0("diff ~ 1 | 0 | 0 | ",paste(cluster, collapse=" + "))), data = dt, subset = subset);
    } else {
      mean1 = weighted.mean(x=dt[subset,var1],w=dt[subset,weight.var], na.rm=T);
      mean2 = weighted.mean(x=dt[subset,var2],w=dt[subset,weight.var], na.rm=T);
      tTest = felm(as.formula(paste0("diff ~ 1 | 0 | 0 | ",paste(cluster, collapse=" + "))), data = dt, weights = dt[,weight.var]);
    }  
    out = c(mean1,mean2,tTest$coeff[1],tTest$coeff[1]/mean2,round(c(tTest$STATS$diff$ctval[1],tTest$STATS$diff$cpval[1]),5));
  }
  names(out) = c(var1,var2,"diff","%","t","p");
  return(out);
}
