## Solutions to HW3
# Econ 404
#intall these packages if you don't have them
#install.packages("bayesm")
#install.packages("data.table")
library(bayesm)
library(data.table)
data("bank")
# create our data frames
choiceatt <- bank$choiceAtt
demo <- bank$demo
#merging the two dataframes by subject id
combined_df=merge(choiceatt,demo,by.x="id",by.y="id")
df1 <- glm (choice~Low_FInt + Med_FInt + Med_VInt + Low_Fee + Med_Fee + Bank_B + Out_State + Rewrd_2 + Rewrd_3 + Rewrd_4 + Med_Rebate + High_Rebate + High_CredLine + Long_Grace, data=combined_df, family=binomial())
summary(df1)
# OLS (linear propbabilty model) version
df1 <- lm (choice~Low_FInt + Med_FInt + Med_VInt + Low_Fee + Med_Fee + Bank_B + Out_State + Rewrd_2 + Rewrd_3 + Rewrd_4 + Med_Rebate + High_Rebate + High_CredLine + Long_Grace, data=combined_df)
summary(df1)
#adjust R^2=0.25, indicating choices are pretty noisy, common in hypothetical studies
# again we see fees and interest rates have similar coefficients. in general people pay more due to interest
# rates, so this seems suboptimal. explains why we see "no fee, high interest rate" cards prevailing in market
# observations: people like lower interest rates, variable interest rates over fixed, like low fees.
# they seem to care about fees and interest rates roughly to the same degree. this probably does not make sense for people carrying large balances.
df2 <- glm (choice~Low_FInt + Med_FInt + Med_VInt + Low_Fee + Med_Fee + Bank_B + Out_State + Rewrd_2 + Rewrd_3 + Rewrd_4 + Med_Rebate + High_Rebate + High_CredLine + Long_Grace + Low_Fee*Low_FInt, data=combined_df, family=binomial())
summary(df2)
#the coef is -0.31028 and t=-1.5. We were expecting a positive, statistically significant result. My hypothesis was NOT correct.
# we can also check to see if there is data coverage for this case
table(combined_df$Low_FInt,combined_df$Low_Fee)
table(combined_df$Low_FInt,combined_df$Low_Fee)
#Results: they are never both 1! coefficient is identified off 120 times it was -1 and 1.
# -1 0 1
#-1 0 1181 0
#0 1184 10277 724
#1 120 1313 0
# why? Conjoint software shows choices that are sufficiently different from each other. a low fee/low int rate alternative
# would "dominate" choices and was not ever shown.
# so the real answer (which is really a bonus answer) is that we don't have the right data to test this hypothesis
df3 <- glm (choice~Low_FInt + Med_FInt + Med_VInt + Low_Fee + Med_Fee, data=combined_df, family=binomial())
summary(df3)
# we see the same pattern of results, but the coefficient estimates have moved a bit. In particular, low_fee seems to have a
#smaller effect. this tells use the choice sets probably combine low_fee with "bad attributes" on average,
# creating omitted variable bias vs. the full model
# the fit is substantially worse, which is expected as, we have omitted significantly important variables
# creating a df for subject id=1
sub1 <- subset(combined_df, id==1)
sub1_mod <- glm (choice~Low_FInt + Med_FInt + Med_VInt + Low_Fee + Med_Fee, data=sub1, family=binomial())
summary(sub1_mod)
# we can see we run into a problem, we can't estimate all the coefficients
#Coefficients: (2 not defined because of singularities)
#Estimate Std. Error z value Pr(>|z|)
#(Intercept) 1.1580 0.6655 1.740 0.0819 .
#Low_FInt NA NA NA NA
#Med_FInt 17.4081 3765.8472 0.005 0.9963
#Med_VInt NA NA NA NA
#Low_Fee 0.9148 2.0087 0.455 0.6488
#Med_Fee 0.4574 1.4185 0.322 0.747
#let's try just the interest rates
sub1_mod <- glm (choice~Low_FInt + Med_FInt + Med_VInt, data=sub1, family=binomial())
summary(sub1_mod)
# we still see that there are NAs. this is a problem we cannot get around easily
# the summar vector has 6 entries, the last is the max
# let's create our variables to put the estimates, we'll put them in the demo data frame
demo$beta_low_fee=NA
demo$beta_med_fee=NA
demo$beta_low_int=NA
demo$beta_med_vint=NA
demo$beta_med_fint=NA
# if you did this you will notice that some ids are missing. let's solve this
demo$newid=seq(1,length(demo$id), by=1)
loop_df=merge(choiceatt,demo,by.x="id",by.y="id")
for (i in 1:length(demo$id)) {
temp_df <- subset(loop_df,newid==i)
temp_mod <- glm (choice~Low_FInt + Med_FInt + Med_VInt + Low_Fee + Med_Fee, data=temp_df, family=binomial())
# note match the order
demo$beta_low_int[i]=coef(temp_mod)[2]
demo$beta_med_fint[i]=coef(temp_mod)[3]
demo$beta_med_vint[i]=coef(temp_mod)[4]
demo$beta_low_fee[i]=coef(temp_mod)[5]
demo$beta_med_fee[i]=coef(temp_mod)[6]
}
# now I'll show you how to access the standard errors, which was part of the bonus question
demo$se_low_fee=NA
demo$se_med_fee=NA
demo$se_low_int=NA
demo$se_med_vint=NA
demo$se_med_fint=NA
# may as well do it all in one loop
# the problem we'll have is that sometimes we estimate NA's due to thin
# in this case, R does not output a standard error, so if we try to reference
# summary$coefficients, it will return an error
for (i in 1:length(demo$id)) {
temp_df <- subset(loop_df,newid==i)
temp_mod <- glm (choice~Low_FInt + Med_FInt + Med_VInt + Low_Fee + Med_Fee, data=temp_df, family=binomial())
mysum <- summary(temp_mod)
# note match the order
demo$beta_low_int[i]=coef(temp_mod)[2]
demo$beta_med_fint[i]=coef(temp_mod)[3]
demo$beta_med_vint[i]=coef(temp_mod)[4]
demo$beta_low_fee[i]=coef(temp_mod)[5]
demo$beta_med_fee[i]=coef(temp_mod)[6]
demo$se_low_fee[i]=tryCatch(mysum$coefficients["Low_Fee",2], error=function(e) {return(NA)})
demo$se_med_fee[i]=tryCatch(mysum$coefficients["Med_Fee",2], error=function(e) {return(NA)})
demo$se_low_int[i]=tryCatch(mysum$coefficients["Low_FInt",2], error=function(e) {return(NA)})
demo$se_med_fint[i]=tryCatch(mysum$coefficients["Med_FInt",2], error=function(e) {return(NA)})
demo$se_med_vint[i]=tryCatch(mysum$coefficients["Med_VInt",2], error=function(e) {return(NA)})
}
# now let's check out the results
summary(demo)
# the standard errors are huge, pretty much revealing the individual level regression of this complexity is too much
# what about a simpler model with just low fee & use a linear regresion
plot(demo$beta_low_fee,demo$se_low_fee)
hist(demo$beta_low_fee)
#perhaps we could try a simple linear regression
demo$se_low_fee=NA
demo$se_med_fee=NA
demo$se_low_int=NA
demo$se_med_vint=NA
demo$se_med_fint=NA
# may as well do it all in one loop
# the problem we'll have is that sometimes we estimate NA's due to thin
# in this case, R does not output a standard error, so if we try to reference
# summary$coefficients, it will return an error
for (i in 1:length(demo$id)) {
temp_df <- subset(loop_df,newid==i)
temp_mod <- lm (choice~Low_FInt + Med_FInt + Med_VInt + Low_Fee + Med_Fee, data=temp_df)
mysum <- summary(temp_mod)
# note match the order
demo$beta_low_int[i]=coef(temp_mod)[2]
demo$beta_med_fint[i]=coef(temp_mod)[3]
demo$beta_med_vint[i]=coef(temp_mod)[4]
demo$beta_low_fee[i]=coef(temp_mod)[5]
demo$beta_med_fee[i]=coef(temp_mod)[6]
demo$se_low_fee[i]=tryCatch(mysum$coefficients["Low_Fee",2], error=function(e) {return(NA)})
demo$se_med_fee[i]=tryCatch(mysum$coefficients["Med_Fee",2], error=function(e) {return(NA)})
demo$se_low_int[i]=tryCatch(mysum$coefficients["Low_FInt",2], error=function(e) {return(NA)})
demo$se_med_fint[i]=tryCatch(mysum$coefficients["Med_FInt",2], error=function(e) {return(NA)})
demo$se_med_vint[i]=tryCatch(mysum$coefficients["Med_VInt",2], error=function(e) {return(NA)})
}
summary(demo)
# se's are still large relative to beta.
hist(demo$beta_low_fee)
plot(demo$beta_low_fee,demo$se_low_fee)
demo$sig_low_fee=abs(demo$beta_low_fee)>2*demo$se_low_fee
summary(demo$sig_low_fee)
# we see that 180 of the 713 coefficients are statistically siginficant at 0.05 level
# which is more than we'd expect by chance
demo$low_fee_pos=demo$beta_low_fee>0
table(demo$low_fee_pos,demo$sig_low_fee)
#positive/significant
# FALSE TRUE
#FALSE 171 15
#TRUE 362 165
# we see that of the significant ones, vast majority are positive
# conclusion --> there is evidence that some people significantly like low fees
# essentially nobody signficantly dislikes low fees (the # significant is close to what we'd expect by chance)
# let's if we can predict the significantly positive with demographics
demo$sig_pos_low_fee=demo$sig_low_fee==1 & demo$low_fee_pos==1
low_fee_mod=glm(sig_pos_low_fee~age + income + gender, data=demo,family = ("binomial"))
summary(low_fee_mod)
#consistent with what'll see below, gender is a marginal predictor, other factors don't matter
#insight: we got a little better running OLS instead of Logit and we do see significant differences at the indvidual level
# however, we cannot link these differences to the data demographics we have
#install.packages("data.table")
#library(data.table)
# now we'll do it the "data table way". this is one of the correct answers to the bonus question.
loop.dt = data.table(loop_df)
demo.2 = loop.dt[,list(betas=list(coef(glm(choice~Low_FInt + Med_FInt + Med_VInt+Low_Fee + Med_Fee, family=binomial())))),by=newid]
betas = demo.2[,list(beta_low_int=unlist(betas)[2],beta_med_fint=unlist(betas)[3],
beta_med_vint=unlist(betas)[4],beta_low_fee=unlist(betas)[5],
beta_med_fee=unlist(betas)[6]),by=newid]
demo.table=data.table(bank$demo)
#add in new id & merge to demographics
demo.table[,newid:=1:.N]
setkey(demo.table,newid)
setkey(betas,newid)
demo_betas=demo.table[betas]
# Looking at the summary means/medians
summary(demos_betas)
# making some histograms
hist(demo_betas$beta_low_int)
hist(demo_betas$beta_med_fint)
hist(demo_betas$beta_med_vint)
hist(demo_betas$beta_low_fee)
hist(demo_betas$beta_med_fee)
# low interest rate on demographics
int_mod <- lm(beta_low_int ~ age + income + gender, data=demo_betas)
summary(int_mod)
int_mod <- lm(beta_med_fint ~ age + income + gender, data=demo_betas)
summary(int_mod)
# a more general model
int_mod <- lm(beta_med_fint ~ as.factor(age) + as.factor(income) + gender, data=demo_betas)
summary(int_mod)
# demographics don't seem to matter for low interest rate
#same approach on fees
fee_mod <- lm(beta_low_fee ~ age + income + gender, data=demo_betas)
summary(fee_mod)
# a more general model
fee_mod <- lm(beta_low_fee ~ as.factor(age) + as.factor(income) + gender, data=demo_betas)
summary(fee_mod)
# a more general model yields the same conclusion
fee_mod <- lm(beta_med_fee ~ age + income + gender, data=demo_betas)
summary(fee_mod)
fee_mod <- lm(beta_med_fee ~ as.factor(age) + as.factor(income) + gender, data=demo_betas)
summary(fee_mod)
# there is evidence that men are more willing to accept fees than women
# none of the other demographics seemed to predict choice
# Bonus: another way of looking at this, take the combined data frame and use interaction terms
df1 <- glm (choice~Low_FInt + Med_FInt + Med_VInt + Low_Fee*gender + Med_Fee*gender + Bank_B + Out_State + Rewrd_2 + Rewrd_3 + Rewrd_4 + Med_Rebate + High_Rebate + High_CredLine + Long_Grace, data=combined_df, family=binomial())
summary(df1)
# as we expect, the gender interaction terms are significantly positive
# what about the hypothesis that a higher income makes you more willing to accept a variable interest rate
df1 <- glm (choice~Low_FInt + Med_FInt*income + Med_VInt*income + Low_Fee*gender + Med_Fee*gender + Bank_B + Out_State + Rewrd_2 + Rewrd_3 + Rewrd_4 + Med_Rebate + High_Rebate + High_CredLine + Long_Grace, data=combined_df, family=binomial())
summary(df1)
# higher income people are slightly more likely to accept higher interest rates, but hte differences are neglible