library(rpart)
library(rpart.plot)
library(partykit)
# Some advanced packages to be aware of
#library(causalTree)
library(permute)
library(maptree)
library(formattable)
library(dplyr)
library(plyr)
library(doBy)
library(reshape2)
library(tidyr)
library(glmnet)
# Point R at the project data directory and load the orange-juice scanner panel.
setwd("C:/Users/jlariv/Dropbox/R/UW R code")
mydata <- read.csv("oj.csv")
# Inspect the first 17 column names.
# BUG FIX: the original used colnames(mydata)[1-17]; `1-17` evaluates to -16,
# which DROPS the 16th name rather than selecting names 1 through 17.
colnames(mydata)[1:17]
# Recover quantity sold (units moved) from its log.
mydata$Q <- exp(mydata$logmove)
# Round every numeric column to four decimal places to keep printed output tidy.
numeric_cols <- vapply(mydata, is.numeric, logical(1))
mydata[numeric_cols] <- lapply(mydata[numeric_cols], function(col) round(col, 4))
# Keep only the variables needed for the demand analysis; the demographic
# columns can be merged back in from mydata later.
smaller <- mydata[c("store", "brand", "week", "feat", "price", "Q")]
# Check which brands appear in the data.
unique(smaller$brand)
# Quantity-weighted average price for each store-week (weights = units sold).
# ddply splits by store/week and applies the weighted mean within each cell.
temp <- ddply(
  smaller, c("store", "week"),
  function(grp) c(weighted_mean = weighted.mean(grp$price, grp$Q))
)
# Attach the store-week weighted price back onto the full data set.
complete <- merge(temp, mydata, by = c("store", "week"))
# The tree needs one row per store-week, so keep a single brand's rows.
complete_tree <- complete[complete$brand == "dominicks", ]
# Variables passed to rpart: the target (weighted_mean) plus store demographics.
dataToPass <- complete_tree[, c(
  "weighted_mean", "AGE60", "EDUC", "ETHNIC", "INCOME", "HHLARGE",
  "WORKWOM", "HVAL150", "SSTRDIST", "SSTRVOL", "CPDIST5", "CPWVOL5"
)]
# Regression tree segmenting stores by demographics; cp = 0.007 sets pruning.
fit <- rpart(weighted_mean ~ ., data = dataToPass, method = "anova", cp = 0.007)
# Plot the fitted tree.
draw.tree(fit)
# Record which terminal node (leaf) of the tree each store-week fell into;
# fit$where holds the node index for every row used in the rpart fit.
dataToPass$leaf = fit$where
#### Alternative: attach the leaf directly to `complete` in one line.
#complete$leaf = fit$where
# Three leaves result. Per the analysis below, 2 is the lowest home value
# leaf and 5 is the highest.
L <- unique(dataToPass$leaf)
# NOTE: This is an issue because the crosswalk is keyed on a single variable.
# The store numbers weren't kept in the df passed to rpart (unused columns are
# dropped), so HVAL150 is used as a proxy key. summaryBy collapses to one row
# per HVAL150 value, producing a `leaf.mean` column.
# NOTE(review): this assumes each HVAL150 value maps to exactly one leaf
# (i.e. uniquely identifies a store's segment) — TODO confirm; ties would
# fan out the merge below.
store_leaf_crosswalk <- summaryBy(leaf ~ HVAL150, data=dataToPass)
# Re-attach the leaf assignment to the full data via the HVAL150 key.
complete<- merge(mydata, store_leaf_crosswalk, by.x="HVAL150", by.y="HVAL150")
# Sanity-check regression (first part of question 2): log-log demand for
# Tropicana with elasticity and feature effects allowed to differ by leaf.
reg = lm(logmove~log(price)*as.factor(leaf.mean)*feat, data=subset(complete, brand=="tropicana"))
summary(reg)
# Leaf-specific regressions.
# First reshape to wide: one row per store-week, with separate price, logmove,
# and feat columns for each brand (price.dominicks, price.minute.maid, ...).
temp <- subset(complete[,c("price", "brand","store", "week", "logmove", "leaf.mean", "feat")])
temp <- reshape(temp, timevar = "brand", idvar = c("store", "week", "leaf.mean"), direction = "wide")
# NOTE: a useful trick to create unique identifiers if you ever need to:
#temp$storeweek <- temp$store*10000+temp$week
# The LHS (logmove.<brand>) variables we'll eventually regress on.
# NOTE(review): positions 5, 8, 11 assume reshape emits the wide columns in
# blocks of (price, logmove, feat) per brand after the 2 id columns — this is
# position-dependent; verify with names(temp) if the input columns change.
varlist <- names(temp)[c(5,8,11)]
# Allocate an empty 9 x 4 frame to collect coefficients: one row per
# brand-regression within each leaf, one column per brand plus the leaf id.
# Row labels encode brand + leaf number (leaves 2, 4, 5).
blank_coef_frame <- function() {
  out <- data.frame(matrix(NA, length(varlist) * length(L), length(L) + 1))
  colnames(out) <- c("Dom", "MinMaid", "Trop", "leaf")
  rownames(out) <- c(
    "Dom2", "MinMaid2", "Trop2",
    "Dom4", "MinMaid4", "Trop4",
    "Dom5", "MinMaid5", "Trop5"
  )
  out
}
mm <- blank_coef_frame()                 # own/cross price elasticities
featured <- blank_coef_frame()           # feature (intercept-shift) effects
feature_elasticity <- blank_coef_frame() # price x feature interactions
# For every leaf, fit one log-log demand regression per brand (same RHS in all
# three) and harvest the price, feature, and price-x-feature coefficients into
# mm, featured, and feature_elasticity respectively.
#
# FIXES vs the original:
#  * Coefficients are extracted by NAME instead of by position
#    (coef(reg)[2], [4], ... silently break if the formula's term order
#    changes); the names are identical across the three fits because the RHS
#    is the same.
#  * The row index stride is length(varlist) (3 brand rows per leaf), not
#    length(L) — the original only worked because both happen to equal 3.
for (i in seq_along(L)) {
  temp_df <- subset(temp, leaf.mean == L[i])
  regd <- lm(logmove.dominicks ~ log(price.dominicks) * feat.dominicks +
               log(price.minute.maid) * feat.minute.maid +
               log(price.tropicana) * feat.tropicana, data = temp_df)
  regm <- lm(logmove.minute.maid ~ log(price.dominicks) * feat.dominicks +
               log(price.minute.maid) * feat.minute.maid +
               log(price.tropicana) * feat.tropicana, data = temp_df)
  regt <- lm(logmove.tropicana ~ log(price.dominicks) * feat.dominicks +
               log(price.minute.maid) * feat.minute.maid +
               log(price.tropicana) * feat.tropicana, data = temp_df)
  fits <- list(regd, regm, regt)
  for (j in seq_along(fits)) {
    cf <- coef(fits[[j]])
    # Rows are ordered Dom, MinMaid, Trop within each leaf block.
    row <- (i - 1) * length(varlist) + j
    # Price elasticities (main log-price effects).
    mm$Dom[row]     <- unname(cf["log(price.dominicks)"])
    mm$MinMaid[row] <- unname(cf["log(price.minute.maid)"])
    mm$Trop[row]    <- unname(cf["log(price.tropicana)"])
    mm$leaf[row]    <- L[i]
    # Feature main effects (demand shift when a brand is featured).
    featured$Dom[row]     <- unname(cf["feat.dominicks"])
    featured$MinMaid[row] <- unname(cf["feat.minute.maid"])
    featured$Trop[row]    <- unname(cf["feat.tropicana"])
    featured$leaf[row]    <- L[i]
    # How featuring changes the price elasticity (interaction terms).
    feature_elasticity$Dom[row]     <- unname(cf["log(price.dominicks):feat.dominicks"])
    feature_elasticity$MinMaid[row] <- unname(cf["log(price.minute.maid):feat.minute.maid"])
    feature_elasticity$Trop[row]    <- unname(cf["log(price.tropicana):feat.tropicana"])
    feature_elasticity$leaf[row]    <- L[i]
  }
}
# Inspect the three coefficient tables interactively.
View(mm)
View(featured)
View(feature_elasticity)
# Re-run the Minute Maid regression for leaf 4 (middle-wealth stores) so the
# full summary output can be examined on its own.
temp_df <- subset(temp, leaf.mean == 4)
regm <- lm(
  logmove.minute.maid ~ log(price.dominicks) * feat.dominicks +
    log(price.minute.maid) * feat.minute.maid +
    log(price.tropicana) * feat.tropicana,
  data = temp_df
)
summary(regm)
#####
# Looking at the mm dataframe, the second leaf has the highest own price elasticities by far. The cross price elasticities aren't very different between the second
# and fourth leaves (e.g., low and middle wealth stores). There is almost no cross price elasticity between Dominicks and Tropicana at high end stores so no need
# to think about those products as substitutes.
#####