### R code for Tsinghua Workshop on R
### 2017/6/2
### James Myers
### http://www.ccunix.ccu.edu.tw/~lngmyers/TsinghuaR.htm

### 1. Basic R commands

### 1.1 You can type commands directly into R for an immediate response,
###     but for real projects it's best to write up the code/script in a separate file.
###     (You can also run the whole file using the source() function, but we won't.)

### 1.2 R functions always have the structure FUNCTION(ARGUMENTS)
###     This looks like Excel functions, except there's no = and CaPiTalizAtIon mAtTers!

### 1.3 R is a so-called object-oriented language, which means that every command
###     also creates something that you can name, do stuff with, and look inside.

### 1.4 This is true even for the simplest commands, like sum():
sum(1,3)
x # R doesn't know this variable yet
x = sum(1,3) # You can also write x <- sum(1,3) (as in many R books and web tutorials)
x # Now R knows what it is!
X # But capitalization matters!

### 1.5 What is that [1] doing there?
###     Since R is for statistics, it's designed for sets (= vectors),
###     so even a single number is treated as an element of a set (here, the 1st element)

### 1.6 We can make larger vectors with other functions and symbols:
c(1,54,3,765334,547,234,7,25498,223,435,778,97,5245,23,92,5468,15,8,246) # c = combine
5:234 # A sequence of integers; see also rep() and seq() functions
rnorm(100) # A random sample of 100 numbers from the standard normal distribution

### 1.7 Most R functions can (or must!) apply to an entire vector:
x = c(1,54,3,765334,547,234)
length(x) # Number of elements (like Excel =COUNT(), not =LEN())
mean(x) # Like Excel =AVERAGE()
mean(1,54,3,765334,547,234) # Ignores all numbers except the first one!
sqrt(x) # Some functions apply element by element
x^2
(1:10)/100000000 # You know how to read scientific notation, right?

### 1.8 We can pick out one or more elements using []
x[2]
x[2:4]

### 1.9 Character strings can also form vectors
y = c("This","tutorial","is","fun")
y
length(y) # R can count character strings too
nchar(y) # This acts like Excel's =LEN(), but it applies to each element

### 1.10 The list of filenames in your local folder is also a character vector
dir() # You can also use a character string as an argument, for a folder in your folder
myfiles = dir()
myfiles[3]

### 1.11 R makes it super easy to compute corpus word frequencies!
corpus = c("Once","upon","a","time","there","was","a","little","boy","named","Jimmy")
table(corpus) # Creates a frequency table

### 1.12 R functions may include default argument values
?rnorm # By default, mean = 0 and sd = 1, to make a standard normal distribution
x = rnorm(100,mean=100,sd=10) # Change the defaults
mean(x)
sd(x)

### 1.13 R also has logic functions (like Excel)
x = 3 # Puts the value of 3 into the variable x
x == 3 # States that x is 3
if(x==3) {y="It's three!"} else {y="It's not three!"} # Just like Excel's =IF(X,THEN_Y,ELSE_Z)
y
if(x==4) { # This format may be easier to read (one command per line)
  y="It's three!"
} else {
  y="It's not three!"
}
y

### 1.14 R also can do loops (unlike Excel)
x = numeric(10) # Creates an empty vector
x
for (i in 1:10) {
  x[i] = sqrt(i)
}
x

### 1.15 But loops can be slow!
x = 1:100000
sqrt(x) # Quite fast
for (i in x) {
  print(sqrt(i)) # Quite slow
}

### 1.16 Subsets can also be defined by logic
x = 1:10
x[x>5]
y = c("a","b","c","d","e","f","g","h","i","j")
y[x>5]

### 1.17 A 2D arrangement is called a matrix
x = matrix(1:15,ncol=3,nrow=5)
x # The values are filled in column by column...
x[2,3] # ... but the cell indices represent row, then column! (Sorry!)
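### (A quick extra illustration: you can also grab a whole row or column by leaving
### one of the two indices empty.)
x[2,] # The whole second row
x[,3] # The whole third column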
colnames(x) = c("A","B","C")
rownames(x) = c("I","II","III","IV","V")
x # Yes, Excel is a lot less confusing!

### 2. OK, let's finally look at some real data

### 2.1 We're first going to play with FakeCharExp.txt, which is the data set for
###     Myers, J. (2016). Knowing Chinese character grammar. Cognition, 147, 127-132.
###     Briefly, Chinese readers looked at fake characters and gave yes/no judgments
###     about whether they looked "like Chinese", and we recorded their choices and
###     reaction times in milliseconds (RT). The "experimental" items always contained
###     reduplicated elements arranged in three different shapes, like 品, 林, 哥.
###     In "lexical" characters, these elements also reduplicate in real characters
###     (like 口, 木, 可), while in "nonlexical" ones, the elements never reduplicate.
###     This factor was crossed with "grammaticality", whereby characters either
###     obeyed reduplication constraints (XX is OK) or violated them (XXX is usually
###     not OK). The experimental items were distributed into four Latin square
###     subsets so that nobody saw more than one item from each quartet (lex/nonlex
###     * gram/ungram). The experimental items were mixed with fake character "fillers"
###     that combined two real components (FH: filler high acceptability),
###     added/deleted a stroke (FM: filler medium acceptability), or
###     flipped a component (FL: filler low acceptability).

### 2.2 The data were collected using the free PsychoPy program (http://www.psychopy.org/),
###     which generated separate CSV files (comma-separated values) for each subject.
###     R was used to read each file, combine them together, and rename variables,
###     but we'll skip all that to save time!

### 2.3 Usually your data are arranged in a tab-delimited table (e.g., created in Excel),
###     so the basic R data object is a data frame (a kind of fancy matrix)
chars = read.delim("FakeCharExp.txt") # Or read.table("FakeCharExp.txt",header=T)
chars # Look at the whole thing: quite long!
head(chars) # First six lines. Columns = variables, rows = individual data points
tail(chars) # Last six lines (sometimes useful to know...)

### 2.4 Let's look inside our data frame
Type # R can't "see inside" the data frame without help
ls() # Lists the objects in R's current workspace: Type isn't there
chars$Type # X$Y means that Y is a property of X
unique(chars$Type) # Removes duplicates; by default, character strings are factors

### 2.5 One confusing thing about R is that it's picky about variable types;
###     so chars$Type is treated as a factor, not as a character string vector.
###     Annoyingly, even if you take a subset of your data frame, R will still remember
###     ALL of the levels of the factors, which will cause trouble for us later.
###     You can always convert a character string into a factor with the factor() function,
###     so let's go back and load our data in again, but this time turn off this factor default,
###     and convert the strings to factors only when we need them:
chars = read.delim("FakeCharExp.txt", stringsAsFactors = F)
unique(chars$Type) # Now no factor levels

### 2.6 What's the overall mean RT...?
mean(chars$RT) # Why does it say "NA" (not available)?
is.na(chars$RT) # Since there's some missing data (the "." is part of the function name)
sum(is.na(chars$RT)) # TRUE = 1 and FALSE = 0 when you do math on logical vectors
chars.nona = na.omit(chars) # Omit rows with NA (I can use "." in naming objects too)
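### (A quick aside: na.omit() drops a row if ANY column contains an NA. If you only
### wanted to drop rows with missing RTs, you could subset on that one column instead;
### whether that keeps more rows depends on where the NAs are.)
nrow(na.omit(chars)); nrow(chars[!is.na(chars$RT),])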
nrow(chars); nrow(chars.nona) # Data frame rows; semicolon for multiple commands
chars = chars.nona # We don't need those rows...
mean(chars$RT) # There we go!
sd(chars$RT)
mean(chars$RT[chars$Subj==1]) # The mean RT just for Subj 1

### 2.7 But parametric statistics (t test, ANOVA, etc) work best with normal distributions,
###     and RTs tend to be rightward skewed (they can't be shorter than 0 ms!)
hist(chars$RT) # Indeed, the histogram looks skewed
plot(density(chars$RT)) # A smoothed histogram does too
# Try a Q-Q norm plot (quantile-quantile plot: compares against ideal normal distribution)
qqnorm(chars$RT); qqline(chars$RT) # Quite skewed!
chars$logRT = log(chars$RT) # That's base-e ("natural") log (=LN() in Excel)
head(chars) # A new column!
par(mfrow=c(1,3)) # Put next three plots in one row
# Are the log RTs less skewed? Yes!
hist(chars$logRT); plot(density(chars$logRT)); qqnorm(chars$logRT); qqline(chars$logRT)
par(mfrow=c(1,1)) # Return to default

### 3. Let's do statistics now! How about some chi-squared tests?

### 3.1 Are the number of "yes" responses the same as the number of "no" responses?
###     Let's do a one-way chi-squared test!
table(chars$Response)
chisq.test(table(chars$Response)) # Significant! More 1s than 0s

### 3.2 For the experimental trials, do lexicality and grammaticality interact
###     in judgments of acceptability? Let's do a two-way chi-squared test!
unique(chars$Type) # Remind myself how I labeled the experimental trials
# Subset of rows with this variable value (this is the part where factor levels would cause trouble)
chars.exp = subset(chars, chars$Type=="Exp")
head(chars.exp) # See the object I just made....
# Now we can use tapply() to apply the sum() function to Response by a list of factors
tapply(chars.exp$Response, list(chars.exp$Lexical, chars.exp$Grammatical), sum)
lexgram.table = tapply(chars.exp$Response, list(chars.exp$Lexical,chars.exp$Grammatical), sum) # Name this object
chisq.test(lexgram.table) # Not significant! No Lexical * Grammatical interaction

### 3.3 Let's plot these last results
# Make a bare-bones bar plot for you to check your own results:
barplot(lexgram.table, legend.text=rownames(lexgram.table), beside=T)
# Here's a prettier one that you could put in a Powerpoint:
barplot(lexgram.table, legend.text=rownames(lexgram.table), beside=T,
        ylab="Yes responses", ylim = c(0,200), col=c("red","blue"), main="My dumb results")
# But R lets you make many different kinds of plots. For example, a mosaic plot
# is more like what the two-way chi-squared test is actually testing:
mosaicplot(lexgram.table)
# Many people (but not me) like to use the ggplot2 package to make even fancier plots:
# First, we need to rearrange the data as a data frame
# The aggregate() function is like tapply() in that it applies a function F to
# a vector Y in terms of other variables X1, X2: aggregate(Y, list(X1,X2), F).
# But unlike tapply(), aggregate() also lets you write this with "formula syntax":
# Y ~ X1*X2 means "dependent variable Y varies according to the independent variables
# X1, X2, and their interaction". We will see A LOT of formula syntax from now on!
lexgram.data = aggregate(Response~Lexical*Grammatical, data=chars.exp, sum)
lexgram.data # See what we made
# Then use R's menu: Packages > Install package(s)...
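### (Alternatively, you can skip the menu and install the package from code;
### in an interactive session this may first ask you to pick a CRAN mirror:)
install.packages("ggplot2")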
# Find the "ggplot2" package, which uses something called the "grammar of graphics"
# After it's installed, you can load the package like so:
library(ggplot2)
# Now use ggplot2's stupid syntax to make the stupid-looking plot:
ggplot(lexgram.data, aes(x=Grammatical, y=Response, fill=Lexical)) +
  geom_bar(position="dodge", stat="identity") # aes = aesthetics; geom = shapes; etc

### 3.4 Note that chi-squared tests are NOT recommended for this particular
###     data set, since the data points are not independent! (Later we'll do
###     a fancy kind of logistic regression which takes the non-independence into account.)

### 4. How about some t tests?

### 4.1 It won't make sense to do t tests across the entire data set, since
###     we have two random variables: participants and items. We also have
###     two kinds of responses ("yes" and "no"), which presumably involve
###     somewhat different psychological processes. So let's look at the mean
###     log RTs by subject and by item for "yes" responses only, as influenced
###     by the binary factor Lexical.
chars.exp.yes = subset(chars.exp, chars.exp$Response==1)
# Use the aggregate function to put our by-subject means into a data frame:
chars.exp.yes.subj = aggregate(logRT~Subj*Lexical, data=chars.exp.yes, mean)
head(chars.exp.yes.subj) # See?
chars.exp.yes.item = aggregate(logRT~Item*Lexical, data=chars.exp.yes, mean)
head(chars.exp.yes.item) # See?

### 4.2 Are the by-subject log RTs for "yes" responses affected by lexicality?
###     Since each subject had both lexical and nonlexical items, this is
###     a paired t test.
t.test(logRT~Lexical, paired=T, data = chars.exp.yes.subj) # Error! Why?
# A cross-tabulation of cell sizes shows that subj 22 rejected ALL nonlexical items
xtabs(~Subj+Lexical, data = chars.exp.yes.subj) # Note the partial formula syntax!
chars.exp.yes.subj.no22 = subset(chars.exp.yes.subj, chars.exp.yes.subj$Subj != 22)
t.test(logRT~Lexical, paired=T, data = chars.exp.yes.subj.no22) # Significant
# Faster to say "yes" to lexical characters
aggregate(logRT~Lexical, data = chars.exp.yes.subj.no22, mean)
# Here's a box-and-whiskers plot of the lexical and nonlexical samples, showing medians
# and distribution shapes
boxplot(logRT~Lexical, data = chars.exp.yes.subj.no22, ylab = "log RT")

### 4.3 What about the by-item analysis? Since items are either lexical or not,
###     this is an unpaired t test. But should we use the more powerful kind that
###     assumes equal variance (technically called a "homoscedastic t test"),
###     or the more general kind that doesn't assume equal variance (technically called
###     a "heteroscedastic t test" or "Welch's t test")?
# A variance test (also known as an F test) shows a significant difference in variance:
var.test(logRT~Lexical, data = chars.exp.yes.item)
# By default, the t.test() function runs the more general but less powerful version:
t.test(logRT~Lexical, data = chars.exp.yes.item)
# The same function can run the equal-variance version too:
t.test(logRT~Lexical, var.equal=T, data = chars.exp.yes.item)

### 4.4 The two types of unpaired t tests give basically the same results anyway,
###     so if your p value is far from .05, it doesn't really matter which you use.
###     Also, of course, it doesn't make sense to do a t test just on Lexical here,
###     since we also have the crucial variable of Grammatical, and you shouldn't
###     do multiple t tests on a data set, since this makes it easy for at least one of
###     them to show p < .05 just by chance!
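### (A quick illustration of that last point, with made-up p values: if you really must
### run several t tests, at least adjust the p values for multiple comparisons,
### e.g. with a Bonferroni correction.)
p.adjust(c(0.04, 0.01, 0.20), method="bonferroni") # Each p is multiplied by 3 (capped at 1)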
### 5. Let's try ANOVA! This is a generalization of the t test, designed to avoid
###    the "multiple comparisons" problem (i.e., it reduces the risk of Type I errors,
###    where you get p < .05 but the null hypothesis is actually true).

### 5.1 We'll start with a one-way ANOVA on the log RTs for "yes" responses to
###     the three types of fillers (FH, FM, FL).
###     Again we first have to compute the by-participant and by-item means.
chars.fill = subset(chars,chars$Type=="Filler")
chars.fill.yes = subset(chars.fill,chars.fill$Response==1)
chars.fill.yes.subj = aggregate(logRT~Subj*FillType, data=chars.fill.yes, mean)
chars.fill.yes.item = aggregate(logRT~Item*FillType, data=chars.fill.yes, mean)

### 5.2 Now let's do the one-way independent-measures ANOVA by item:
chars.fill.yes.item.aov = aov(logRT~FillType, data = chars.fill.yes.item)
summary(chars.fill.yes.item.aov) # summary() shows extra information about some objects
# Which type is faster than which...?
aggregate(logRT~FillType, data = chars.fill.yes.item, mean)
# Which pairs of filler types are different from which?
# Use a posthoc test: Tukey's Honestly Significant Difference,
# to avoid the "multiple comparisons" problem: so-called "planned comparisons" cheat...
# (Tukey is also the inventor of the box-and-whiskers plot.)
TukeyHSD(chars.fill.yes.item.aov) # Applies to an aov object, not a summary(aov) object

### 5.3 Now let's do a one-way repeated-measures ANOVA by subject on the fillers
# You indicate the random variable with the Error() argument,
# and remember to make the numerical Subj vector into a factor!
# Subj is supposed to be categorical, but R can handle numerical random variables,
# so if you don't factorize, you won't get any error message - just the wrong analysis!
# The fixed variable FillType is "nested within" the random variable Subj:
chars.fill.yes.subj.aov = aov(logRT~FillType + Error(as.factor(Subj)/FillType),
                              data = chars.fill.yes.subj)
# Uh-oh, a warning! This time it's subject 23 who didn't ever say "yes" in one condition:
xtabs(~Subj+FillType, data = chars.fill.yes.subj)
chars.fill.yes.subj.no23 = subset(chars.fill.yes.subj, chars.fill.yes.subj$Subj!=23)
chars.fill.yes.subj.no23.aov = aov(logRT~FillType + Error(as.factor(Subj)/FillType),
                                   data = chars.fill.yes.subj.no23)
summary(chars.fill.yes.subj.no23.aov)

### 5.4 Sadly, the TukeyHSD() function doesn't work for repeated-measures ANOVA,
###     though other R packages can do it if you really need it (I don't);
###     we'll talk about loading R packages shortly.
TukeyHSD(chars.fill.yes.subj.no23.aov) # ?TukeyHSD recommends the multcomp package.
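### (If you ever do want pairwise comparisons after a repeated-measures ANOVA, one
### rough base-R sketch is a set of paired t tests with a multiple-comparison
### correction; this assumes the rows are sorted the same way within each FillType,
### and it isn't a full substitute for a proper post-hoc procedure:)
pairwise.t.test(chars.fill.yes.subj.no23$logRT, chars.fill.yes.subj.no23$FillType,
                paired=TRUE, p.adjust.method="bonferroni")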
### 5.5 Multilevel factors in repeated-measures ANOVA also raise the problem of
###     "sphericity", which is like homoscedasticity, i.e. the assumption of equal variance.
###     One standard correction is the Huynh-Feldt epsilon, by which you multiply
###     your two F df values to make them lower, and thus your p value larger.
###     As with Welch's test, I wouldn't worry about it unless your p value is close
###     to .05, or if a journal reviewer makes you!
# Again use R's menu: Packages > Install package(s)...
library(ez)
# Now we can redo our repeated-measures ANOVA using the ez package's ezANOVA() function,
# which unfortunately does NOT use formula syntax:
ezANOVA(data=chars.fill.yes.subj.no23,
        dv = logRT, # dv = "dependent variable"
        wid = Subj, # "within identifier" (automatically converted to a factor)
        within = FillType, # within-group factor (the only factor here)
        # between = ... Not relevant here, but this is how to mark between-group factors
        )
# The top shows the ordinary repeated-measures ANOVA.
# Note also "ges", for "generalized eta-squared", a measure of effect size:
# the proportion of dependent variable variance predicted by this factor.
# So FillType predicts about 10% of the variance in log RT
# The middle does a test for sphericity violation: it's significant!
# The bottom shows the Huynh-Feldt epsilon [HFe] and its p value
# and also the Greenhouse-Geisser epsilon [GGe] and its p value
# (it's a bad sign when even the experts can't agree on what to use!)
# To report the adjusted ANOVA results, you have to multiply your original
# df values by the HFe value:
df1 = 2
df2 = 36
HFe = 0.7315976
df1*HFe; df2*HFe
# So the report would say: F(1.46,26.34) = 11.39, p < .001
# ... but as is usually the case with paranoid statistical advice,
# all this work doesn't make any difference, since our p value was already very low!

### 5.6 In my own research, I avoid non-binary factors, so I never have to worry
###     about post-hocs or sphericity. The human brain can't really understand
###     non-binary factors very well anyway. So let's look at our two binary
###     factors in the experimental items: Lexical * Grammatical
chars.exp.yes.lg.subj = aggregate(logRT~Subj*Lexical*Grammatical, data=chars.exp.yes, mean)
head(chars.exp.yes.lg.subj) # See?
chars.exp.yes.lg.item = aggregate(logRT~Item*Lexical*Grammatical, data=chars.exp.yes, mean)
head(chars.exp.yes.lg.item) # See?

### 5.7 Here's the independent-measures (between-group) two-way ANOVA on the items:
chars.exp.yes.lg.item.aov = aov(logRT~Lexical*Grammatical, data = chars.exp.yes.lg.item)
summary(chars.exp.yes.lg.item.aov) # The interaction is shown as Lexical:Grammatical
# Making a bare-bones interaction line plot will help tell us what this interaction means
# (This is for us researchers to look at; for publishing, you can make it prettier
# by changing the arguments or using the fancy ggplot2 package)
interaction.plot(chars.exp.yes.lg.item$Lexical, # Variable on x-axis
                 chars.exp.yes.lg.item$Grammatical, # Variable in the legend
                 chars.exp.yes.lg.item$logRT # Variable on y-axis
                 )
# Here's a prettier version for publication (use ?interaction.plot to learn how it works):
interaction.plot(chars.exp.yes.lg.item$Lexical, chars.exp.yes.lg.item$Grammatical,
                 chars.exp.yes.lg.item$logRT, xlab="", ylab="log RT", legend=F, lwd=2)
legend("topleft",lty=c(1,2),legend=c("Gram","Ungram"))
# You can make another type of interaction plot with the "effects" package,
# and it also adds error bars representing 95% confidence intervals from your model:
library(effects) # Again, you have to install this package first
# The function I want to use needs the factors to be factor objects, not character strings:
chars.exp.yes.lg.item$Lexical = as.factor(chars.exp.yes.lg.item$Lexical)
chars.exp.yes.lg.item$Grammatical = as.factor(chars.exp.yes.lg.item$Grammatical)
plot(allEffects(chars.exp.yes.lg.item.aov)) # Here's the interaction plot!

### 5.8 Here's the repeated-measures (within-group) two-way ANOVA on the subjects.
# But the fixed and random variables aren't fully crossed again, due to several subjects:
xtabs(~Subj+Lexical+Grammatical, data = chars.exp.yes.lg.subj)
badsubjs = c(11,13,22,26,31) # I was too lazy to figure out how to do this automatically
chars.exp.yes.lg.goodsubj = subset(chars.exp.yes.lg.subj,
                                   !is.element(chars.exp.yes.lg.subj$Subj,badsubjs)) # ! = not
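### (The lazy step above can be automated; here's one possible sketch, not the method
### actually used: count each subject's cells and keep only those with data in all
### four Lexical x Grammatical combinations.)
subj.cells = xtabs(~Subj+Lexical+Grammatical, data = chars.exp.yes.lg.subj)
has.all.cells = apply(subj.cells, 1, function(counts) all(counts > 0))
names(has.all.cells)[!has.all.cells] # Should list the same subjects as badsubjs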
chars.exp.yes.lg.subj.aov = aov(logRT ~ Lexical * Grammatical +
                                  Error(as.factor(Subj)/(Lexical*Grammatical)),
                                data = chars.exp.yes.lg.goodsubj)
summary(chars.exp.yes.lg.subj.aov)

### 6. Time for correlation and linear regression!

### 6.1 But first we need some numerical variables:
items = read.delim("FakeCharExpItems.txt")
head(items)

### 6.2 Are the character element frequencies correlated with the number of strokes?
# Both are counts, which tend to be rightward skewed. Would using logs help?
par(mfrow=c(2,2)) # A 2 x 2 arrangement of plots
qqnorm(items$ElementTypeFreq, main="Element Type Freq"); qqline(items$ElementTypeFreq)
qqnorm(items$Strokes, main="Strokes"); qqline(items$Strokes)
qqnorm(log(items$ElementTypeFreq), main="Log Element Type Freq"); qqline(log(items$ElementTypeFreq))
qqnorm(log(items$Strokes), main="Log Strokes"); qqline(log(items$Strokes))
par(mfrow=c(1,1)) # Go back to default
# Yeah, log-norming helps:
items$logStrokes = log(items$Strokes)
items$logElTypeFreq = log(items$ElementTypeFreq)
# Now we compute Pearson's correlation coefficient and its p value:
cor.test(~logStrokes+logElTypeFreq, data = items) # Negative correlation (Zipf's law!)

### 6.3 So what does this look like as a regression model?
###     It makes more real-world sense to predict strokes from element frequency:
# Let's first make a scatter plot:
plot(logStrokes~logElTypeFreq, data = items)
# Here's our linear model:
stroke.element.lm = lm(logStrokes~logElTypeFreq, data = items)
summary(stroke.element.lm) # Same p value, plus intercept and coefficients etc
# By the way, note the measure of model fit: R-squared (R2), representing
# the proportion of dependent variable variance predicted by the model,
# so log element type frequency only explains about 9% of the variance in the log
# number of strokes.
# We can add the line described by this regression equation to our plot
# ("a b line", for the linear equation y = a + bx):
plot(logStrokes~logElTypeFreq, data = items); abline(stroke.element.lm)
# The vertical distances of each point from this line represent the residuals,
# as we can see by using cbind() to bind columns of residuals from residuals()
# and the differences between logStrokes and the model predictions from predict():
cbind(residuals(stroke.element.lm),items$logStrokes-predict(stroke.element.lm))
# Here are the residuals, visually as line segments:
plot(logStrokes~logElTypeFreq, data = items)
abline(stroke.element.lm, lwd=2) # lwd = line width: 2 = a little thicker
segments(items$logElTypeFreq, items$logStrokes,
         items$logElTypeFreq, predict(stroke.element.lm), lty=2) # lty = line type: 2 means dashed
# Residuals should be normally distributed; if not, you're missing predictors
qqnorm(residuals(stroke.element.lm)); qqline(residuals(stroke.element.lm))

### 6.4 Now here's an easy way to compute standardized beta coefficients:
# You need a function to create z scores:
scale(c(1,2,3,4,5))
# Just convert all of your variables to z scores before you run the linear model,
# and the coefficients become standardized!
summary(lm(scale(logStrokes)~scale(logElTypeFreq), data = items))

### 6.5 Even though these two variables are correlated, they may have different effects
###     on reaction times. Let's find out!
# We first merge the experiment data frame (chars) and the items data frame (items),
# which both use the same variable name for the items (Item). They also redundantly
# contain some of the same item information, but merge() can cope with that:
chars = merge(chars,items,by.x="Item",by.y="Item")
head(chars) # Still more columns!
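### (In case merge() is new to you, here's a tiny toy example; the data frames and
### column names below are made up purely for illustration:)
toy.rts   = data.frame(Item=c("i1","i2","i1"), RT=c(500,650,520))
toy.props = data.frame(Item=c("i1","i2"), Freq=c(10,3))
merge(toy.rts, toy.props, by.x="Item", by.y="Item") # Freq gets copied onto every matching row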
# We'll just do a by-item analysis of "yes" responses to save time...
chars.exp = subset(chars, chars$Type=="Exp") # Create this again (chars has new columns)
chars.exp.yes = subset(chars.exp, chars.exp$Response==1) # Ditto...
chars.strokefreq.item = aggregate(logRT~Item*logStrokes*logElTypeFreq,
                                  data = chars.exp.yes, mean)
head(chars.strokefreq.item)
# Now for our multiple linear regression:
strokefreqRT.lm = lm(logRT~logStrokes+logElTypeFreq, data = chars.strokefreq.item)
summary(strokefreqRT.lm) # Nothing does anything! The intercept just means logRT > 0
# That's disappointing for a tutorial! Let's look at log RT for "no" responses:
chars.exp.no = subset(chars.exp, chars.exp$Response==0)
chars.strokefreq.no.item = aggregate(logRT~Item*logStrokes*logElTypeFreq,
                                     data = chars.exp.no, mean)
strokefreqRT.no.lm = lm(logRT~logStrokes+logElTypeFreq, data = chars.strokefreq.no.item)
summary(strokefreqRT.no.lm)
# OK, now we got something, but logStrokes isn't significant.
# Also, logElTypeFreq is just BARELY significant. Always worry if your p value
# is close to .05 (let's say between .03 and .07), since slight modifications
# in your model may change the significance - which means your effect
# may not be very reliable (and certainly not convincing to skeptics).
# Also, finding a significant effect when Response == 0 but not when Response == 1
# is not itself evidence that Response matters! This is a common fallacy in
# linguistics (and other fields), e.g. finding an effect in English but not in Chinese
# does NOT prove that language matters. To test if Response actually modulates
# the other effects, we'd need to include Response itself as a variable, and test for
# interactions with the other variables. I'll skip this step to save time, but the code
# I used to try this option is below, and it's not in fact significant.
#
# chars.strokefreq.yesno.item = aggregate(logRT~Item*Response*logStrokes*logElTypeFreq,
#                                         data = chars.exp, mean)
# chars.strokefreq.yesno.item$Resp.e =
#   2*chars.strokefreq.yesno.item$Response-1 # 0 vs. 1 => -1 vs. 1 (needed later)
# strokefreqRT.yesno.lm = lm(logRT~Resp.e*logStrokes+logElTypeFreq,
#                            data = chars.strokefreq.yesno.item)
# summary(strokefreqRT.yesno.lm) # Nothing is significant!

### 6.6 We might want to be sure that the independent variables in our model
###     are not "collinear" (i.e., too confounded to distinguish).
###     A simple measure of this is the variance inflation factor (VIF).
###     The VIF for a predictor X1 is the inverse (1/x) of the proportion of its
###     variance that is NOT explained by the other predictors (e.g. for X1 ~ X2 + ...).
###     So you want this number to be as low as possible,
###     with the rule of thumb being that 5 or lower is good enough.
###     (A VIF of 5 corresponds to X1 ~ X2 + ... having an R-squared of .8 (80%).)
# Instead of computing this by hand, we can install a package that has a VIF function:
library(car) # You have to install it first
vif(strokefreqRT.no.lm) # VIF is low enough, so that's not the problem...
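### (To demystify vif() a little: with just two predictors, the VIF of logStrokes is
### 1/(1 - R-squared) from regressing it on the other predictor. This is only a sketch
### to show the logic; the car function generalizes it to any number of predictors.)
collin.r2 = summary(lm(logStrokes~logElTypeFreq, data = chars.strokefreq.no.item))$r.squared
1/(1-collin.r2) # Should match the vif() value for logStrokes above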
### 6.7 Since only one of the two variables is significant in the no-response regression,
###     we can simplify the model by removing the nonsignificant variable.
strokefreqRT.nostrokes.lm = lm(logRT~logElTypeFreq, data = chars.strokefreq.no.item)
summary(strokefreqRT.nostrokes.lm) # Now logElTypeFreq is only marginally significant!
# Use the anova() function to perform a likelihood ratio test (since ANOVA uses F ratios),
# to see if the simpler model is any worse than the full model:
anova(strokefreqRT.nostrokes.lm,strokefreqRT.no.lm) # Simpler vs. fuller model
# No significant difference, so the simpler model is just as good
# But logElTypeFreq is only marginal here. What happens if we compare this simpler
# model with the maximally simple one, with only the intercept, represented as Y~1?
strokefreqRT.no0.lm = lm(logRT~1, data = chars.strokefreq.no.item)
anova(strokefreqRT.no0.lm,strokefreqRT.nostrokes.lm) # Same marginal p value!
# Disappointing, but there's one more trick we can try....

### 6.8 The model-comparison method has a clever application: it can help test
###     if two regression predictors are significantly different from each other.
###     To do this, we compare the model treating the predictors as separate
###     (Y ~ X1 + X2) with a model treating the predictors as if added together
###     to form a single variable (Y ~ I(X1 + X2)), using the identity function I()
strokefreqRT.no.same.lm = lm(logRT ~ I(logStrokes+logElTypeFreq),
                             data = chars.strokefreq.no.item)
# We don't need to look at this model, just compare it
anova(strokefreqRT.no.same.lm,strokefreqRT.no.lm) # Compare with full model: not sig
# Well, I guess we just have to remain disappointed!

### 6.10 Even if you don't use regression very often, it's conceptually central to
###      statistics, and it's crucial to upgrading from ANOVA to mixed-effects modeling,
###      which many linguists now use INSTEAD of ANOVA. So you need to know that
###      t tests and ANOVA are themselves regression. R actually implements the t.test()
###      and aov() functions by using the lm() function!

### 6.11 Let's demo this with the by-item analysis for the Lexical effect on RT
head(chars.exp.yes.item) # This is the data frame we made earlier
t.test(logRT~Lexical, var.equal=T, chars.exp.yes.item) # Look at the t and p value!
summary(aov(logRT~Lexical, data = chars.exp.yes.item)) # Look at F and p! F = t^2
summary(lm(logRT~Lexical, data = chars.exp.yes.item)) # Same t and p values again!
# R uses "dummy coding" for categorical factors: Lex = 0, Nonlex = 1 (ABC order)
# Let's do this coding by hand so we can make a scatter plot:
chars.exp.yes.item$Lexical.dummy = 1*(chars.exp.yes.item$Lexical=="Nonlex") # TRUE = 1
plot(logRT~Lexical.dummy, data = chars.exp.yes.item) # 0 = Lex, 1 = Nonlex
abline(lm(logRT~Lexical.dummy, data = chars.exp.yes.item)) # See the upward slope?
# Notice that this shows that ANOVA and regression also assume equal variance!
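### (If you want to see exactly which numbers R used behind the scenes for a factor,
### you can peek at the model's design matrix; this is just an illustration:)
head(model.matrix(lm(logRT~Lexical, data = chars.exp.yes.item))) # Intercept plus a 0/1 dummy column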
### 6.12 Two-way ANOVA is also multiple regression, but you have to use so-called "effect
###      coding", also known as "sum coding", which contrasts 1 with -1, rather than
###      with 0 (it's more complex with multilevel factors, another reason to avoid them).
# Let's look at the by-item analysis for log RT predicted from Lexical and Grammatical:
head(chars.exp.yes.lg.item) # We created this data frame earlier
summary(aov(logRT~Lexical*Grammatical, data = chars.exp.yes.lg.item)) # Reminder...
# Create effect-coded Lexical and Grammatical variables
chars.exp.yes.lg.item$Lexical.e = as.factor(chars.exp.yes.lg.item$Lexical) # New factors
chars.exp.yes.lg.item$Grammatical.e = as.factor(chars.exp.yes.lg.item$Grammatical)
contrasts(chars.exp.yes.lg.item$Lexical.e) = contr.sum(levels(chars.exp.yes.lg.item$Lexical.e))
contrasts(chars.exp.yes.lg.item$Grammatical.e) = contr.sum(levels(chars.exp.yes.lg.item$Grammatical.e))
contrasts(chars.exp.yes.lg.item$Lexical.e); contrasts(chars.exp.yes.lg.item$Grammatical.e) # See?
# Now we can run the ANOVA as a multiple regression:
summary(lm(logRT~Lexical.e*Grammatical.e, data = chars.exp.yes.lg.item))
# For the interaction, the p value is the same as for the ANOVA, and F = t^2
# The two "main effects" are still different, due to a combination of two things:
# First, our data set is not balanced (different cell sizes), due to dropped data:
xtabs(~Lexical+Grammatical, data = chars.exp.yes.lg.item)
# Second, ANOVA works by factoring out the first factor, then the second,
# and finally the interaction, so the order matters in unbalanced data sets:
summary(aov(logRT~Lexical*Grammatical, data = chars.exp.yes.lg.item)) # Original order
summary(aov(logRT~Grammatical*Lexical, data = chars.exp.yes.lg.item)) # Reversed order
# But order doesn't matter for regression, since it factors out everything at once:
summary(lm(logRT~Lexical.e*Grammatical.e, data = chars.exp.yes.lg.item))
summary(lm(logRT~Grammatical.e*Lexical.e, data = chars.exp.yes.lg.item))

### 6.13 Another possible advantage of regression over ANOVA:
###      you don't need post-hocs to compare levels to a reference level
###      (e.g., the control condition in an experiment)
head(chars.fill.yes.item) # Remember? FH = high, FM = mid, FL = low character-likeness
chars.fill.yes.item$FillType.f = as.factor(chars.fill.yes.item$FillType)
contrasts(chars.fill.yes.item$FillType.f) # FH is treated as the reference level
# Let's pretend that FM is the control condition, so we'll make it the reference level:
chars.fill.yes.item$FillType.f = relevel(chars.fill.yes.item$FillType.f,"FM")
contrasts(chars.fill.yes.item$FillType.f)
summary(aov(logRT~FillType.f, data = chars.fill.yes.item)) # This required a post-hoc
summary(lm(logRT~FillType.f, data = chars.fill.yes.item)) # FH & FL compared with FM!

### 6.14 R-squared in regression is also the same as eta-squared in ANOVA:
summary(lm(logRT~Lexical, data = chars.exp.yes.item)) # See multiple R-squared?
ezANOVA(data=chars.exp.yes.item, dv = logRT, wid = Item, between = Lexical) # See ges?

### 6.15 Repeated-measures ANOVA is also a special case of repeated-measures regression!
###      This involves running separate regressions
###      on each item or subject, which gives you a set of regression coefficients.
###      Then you run one-sample t tests on the coefficients!
# Here's a quick demo with fake data:
S = sort(rep(1:10,10)) # 1,1,1,1,1,1,1,1,1,1,2,2,2,2,...: Our "subject" IDs
x = rep(rnorm(10),10) # The same random independent variable for each subject
y = rnorm(100) # It's probably not going to be significant...
Data = data.frame(S,x,y)
Data # Look at our dumb thing
# Run a repeated-measures regression using a loop:
x.coef = numeric(10) # This will contain the x "slopes" for each subject
for (i in 1:10) { # Run a separate regression for each subject
  this.lm = lm(y~x, data = subset(Data, Data$S==i)) # Linear model just for subject i
  x.coef[i] = coef(this.lm)[2] # Second coefficient for x (first one is intercept)
}
x.coef # Here are our by-subject coefficients; now run a one-sample t test
t.test(x.coef) # Compare t, df, p with repeated-measures ANCOVA F, df2, p!
# Run a repeated-measures ANCOVA (analysis of covariance) and compare the results:
summary(aov(y~x+Error(as.factor(S)/x), data = Data))

### 6.16 By the way, what should we do with our by-item and by-subject analyses once
###      they're done? In most papers, people merely report both, and assume that
###      if both p < .05, then they win. But this is a serious misunderstanding
###      of the original paper that recommended running by-item analyses
###      (Google "language-as-fixed-effect fallacy"). The proper ANOVA strategy
###      is to combine the by-item F and by-subject F into a SINGLE F value,
###      called minF'. It is quite possible for p < .05 for both items and subjects,
###      but minF' p > .05!
###      Almost nobody uses minF' (though I was required to do so once by a journal).
###      Fortunately, mixed-effects modeling is a generalization of repeated-measures
###      regression that allows you to test more than one random variable at a time!

### 6.17 So ANOVA is just a special case of regression, but "real" regression can also
###      handle numerical variables, doesn't require balanced data, gives the same
###      results regardless of factor order, can compute standardized effect sizes,
###      need not require post-hoc tests, doesn't require minF'....
###      So why not just use regression INSTEAD of ANOVA?

### 7. Logistic regression is a generalization of linear regression.

### 7.1 It lets you analyze a categorical dependent variable,
###     particularly binary data, like accuracy (right vs. wrong), or response choice
###     (yes vs. no, word vs. nonword), or presence of a sociolinguistic property
###     (present vs. absent): VARBRUL software uses logistic regression
###     (and there's an R package that imitates it, humorously called Rbrul:
###     http://www.danielezrajohnson.com/rbrul.html)
###     So instead of converting binary values into a continuous value, like
###     accuracy rate (#correct/#total) and doing an ANOVA, it's better to
###     analyze the binary data as binary. You can even use the one-sample t test
###     trick on the coefficients to run repeated-measures logistic regression.

### 7.2 The name "logistic" refers to the mathematical function used to
###     convert the dependent variable into something that you could plot with
###     a straight line, namely the logit (log odds) function.
5/23 # The probability of getting 5 "yes"s in 23 trials
log(5/(23-5)) # The log odds of the same thing
# Log odds can range from -infinity to +infinity, so an infinite line can fit them:
log(0/23); log(23/0)

### 7.3 Cleverly, the log links probability to a linear equation (a sum of factors),
###     because if probabilities are independent, you get their joint probability
###     by multiplying, and the log of multiplication is addition (sum)...
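### (An optional picture of the logit function from 7.2: plotting the log odds of a
### range of probabilities shows how it stretches bounded probabilities onto an
### unbounded scale that a straight line can fit.)
p = seq(0.01, 0.99, by=0.01)
plot(p, log(p/(1-p)), type="l", xlab="probability", ylab="log odds (logit)")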
### 7.4 Because the probabilities involve binary data, logistic regression uses
###     the binomial distribution (the coin-flip distribution), but this
###     distribution becomes more and more normal in larger sample sizes,
###     so the algorithm still lets you compute p values with a version of the
###     z test (here called the "Wald test")

### 7.5 Anyway, let's use logistic regression to see how Lexical and Grammatical
###     in the experimental trials affect the choice of "yes" vs. "no"
###     (this is similar to the chi-squared tests we did earlier, and it
###     has a similar problem, because we're ignoring the non-independence of the data,
###     but we'll deal with this issue later)
head(chars.exp) # We'll just look at the experimental trials
# Use effect or sum coding so the interaction is easier to interpret
chars.exp$Lexical.e = as.factor(chars.exp$Lexical)
chars.exp$Grammatical.e = as.factor(chars.exp$Grammatical)
contrasts(chars.exp$Lexical.e) = contr.sum(levels(chars.exp$Lexical.e))
contrasts(chars.exp$Grammatical.e) = contr.sum(levels(chars.exp$Grammatical.e))
# Use the glm() function for "generalized linear model",
# and use the "binomial" family of distributions:
lexgram.glm = glm(Response~Lexical.e*Grammatical.e, family="binomial", data = chars.exp)
summary(lexgram.glm)
# The intercept represents the overall probability of "yes" (acceptance);
# here it's positive, so there are more acceptances than rejections:
table(chars.exp$Response)
# The signs of the other coefficients show whether the factor increases or decreases
# the probability of responding with "yes". So all together, this analysis
# shows that Lexical and Grammatical both improve acceptability, but they don't interact
# Note the z values: these p values are based on the Wald test, which requires large
# sample sizes (a common rule of thumb: at least 10 data points per independent variable,
# including interactions).

### 7.6 Unlike ordinary linear regression, logistic regression models can only
###     be estimated by starting with plausible coefficients, testing the model fit,
###     adjusting the coefficient values and testing the model fit again,
###     looping and looping until either the model fit doesn't improve any more
###     or you run out of the pre-set number of loops.
###     Thus it's quite possible for this process to take some time, or even crash!
###     Ironically, one type of case where logistic regression crashes is when
###     the independent variable predicts the dependent variable TOO well!
x = 1:100
y = c(rep(0,50),rep(1,50)) # So y changes from 0 to 1 when x > 50
plot(y~x) # x predicts y perfectly!
glm(y~x, family="binomial") # Crash!
# In technical terms, the algorithm "did not converge" on a best-fitting model.

### 7.7 Speaking of fit, how can you measure how well a model fits the data?
###     As we saw, for linear regression, R^2 (R-squared) measures the proportion of
###     variance in the dependent variable that's predicted by the independent
###     variables. For ANOVA, eta-squared is exactly the same thing!
###     Neither has a direct equivalent in logistic regression, but the fit for ALL
###     regression models can be quantified as "AIC" (Akaike Information Criterion),
###     which is SMALLER (more negative) for better-fitting models. The AIC
###     value is universal, so you can use it to compare any model with any other.
AIC(lexgram.glm)
AIC(stroke.element.lm) # This model is better (for a completely different data set)

### 7.8 Of course logistic regression works with numerical variables as well:
elementstrokes.glm = glm(Response~logElTypeFreq+logStrokes, family="binomial",
                         data = chars.exp)
summary(elementstrokes.glm) # Lovely significance!
# Note the AIC value, which is derived from deviance, which is a measure of how
# different the model is from the raw data. If you've studied statistics, you
# might remember the term "deviance" used in computing the chi-squared test too.

### 7.9 Both of our independent variables are significant, but are they
###     different from each other?
# We can look at their effect sizes by turning them into z scores, as we did for
# ordinary linear regression. However, because the dependent variable is categorical,
# we can't do the same thing to the dependent variable, so the resulting coefficients
# aren't truly standardized (i.e., we can't compare them across models):
elementstrokes.z.glm = glm(Response ~ scale(logElTypeFreq) + scale(logStrokes),
                           family="binomial", data = chars.exp)
summary(elementstrokes.z.glm) # The coefficient for logElTypeFreq has a greater magnitude
# To see if these two effects are significantly different from each other, as before
# we can compare the full model with a simpler one adding the two variables with I(),
# using anova(). The only difference is that for categorical data the likelihood
# ratio test is not analysis of variance but "analysis of deviance", so to get
# p values we need a chi-squared test.
elementstrokes.z.same.glm = glm(Response ~ I(scale(logElTypeFreq) + scale(logStrokes)),
                                family="binomial", data = chars.exp)
anova(elementstrokes.z.same.glm, elementstrokes.z.glm, test="Chisq") # Significant!

### 7.10 Logistic regression models have no standard plotting style, but one
###      approach is to take the generalized "linear" concept literally,
###      using the logit function to transform the data so that the best-fit line
###      is a straight line. The easiest way to do this is with the effects package,
###      which rescales the y-axis back into probabilities (from log odds)
# Here's what a model with categorical predictors looks like,
# with 95% confidence intervals:
plot(allEffects(lexgram.glm))
# And here's what a model with numerical predictors looks like,
# with 95% confidence bands (note the heteroscedasticity):
plot(allEffects(elementstrokes.z.glm))

### 8. The looping trick used to fit logistic regression, combined with increased
###    computer power, has allowed the invention of many new types of generalized
###    regression models in the past few decades. The most widely used is
###    mixed-effects modeling.

### 8.1 Mixed-effects models are like repeated-measures regression, except that
###     instead of running by-subject (or by-item) regressions in one step and
###     cross-subject (or cross-item) one-sample t tests in a second step,
###     it does both of these things at the same time, using looping to improve
###     the fit given by ALL of the coefficients at the same time:
###     not just for the fixed variables (intercept, effects, interactions),
###     but also for the random variables.
###     Random intercepts represent the basic effect of each random unit,
###     like the mean for each subject. This allows the model to automatically
###     factor out by-subject variation, like slower vs. faster responders
###     (or higher-pitched vs. lower-pitched speakers, etc).
###     Random slopes represent the interaction between random units and the fixed
###     effects.
###     For example, maybe Subject 1 shows a positive effect of Lexical,
###     Subject 2 shows a negative effect, and Subject 3 shows no effect.
###     This allows the model to automatically factor out things like random variation
###     in response range (e.g., in a Likert-scale task, subjects who use the whole
###     scale vs. those who use only part of it).

### 8.2 The greater complexity of mixed-effects models, however, means that they
###     are even more likely to crash than logistic regression. The crashing often
###     happens because the model is trying to capture random effects that don't
###     really matter.
###     Thus while it's bad to test ONLY the simplest models (Type I errors),
###     it's wise to try to find the simplest model that is "good enough"
###     (i.e., where anova(simplemodel, complexmodel) is not significant).
###     For technical discussion of these issues, the crucial papers are these:
###     Barr, D. J., Levy, R., Scheepers, C., & Tily, H. J. (2013). Random effects
###       structure for confirmatory hypothesis testing: Keep it maximal.
###       Journal of Memory and Language, 68(3), 255-278.
###     Matuschek, H., Kliegl, R., Vasishth, S., Baayen, H., & Bates, D. (2017).
###       Balancing Type I error and power in linear mixed models.
###       Journal of Memory and Language, 94, 305-315.

### 8.3 Let's try it, starting with log RT for "yes" responses. Because mixed-effects
###     modeling can handle two random variables at the same time, we can do this
###     on the "raw" log RTs, without having to first compute by-subject or by-item
###     means!
# First let's create a data frame for all "yes" responses to experimental items:
chars.exp.yes = subset(chars.exp, chars.exp$Response==1)
# Now, we need the package, though it may have already been installed
# by the "ez" and "effects" packages, which we were using above:
library(lme4) # For linear mixed effects in S4 style (version 4 of the S program)
# For linear mixed-effects modeling (with a normally distributed dependent variable)
# the function is lmer() (LME in R).
# We'll first look at Lexical.e and Grammatical.e (effect-coded categorical factors).
# The random parts of the model use the syntax +(factors|random_unit)
# (cf. the repeated-measures syntax for aov(): +Error(as.factor(random_unit)/factors))
# The lmer() function automatically treats the random variable as categorical.
# Both of our fixed factors are "within" subject but "between" items,
# so in the random part of the model we need to reflect this structure:
# This means that subjects have both random intercepts and random slopes
# for both effects and their interaction, but items ONLY have random intercepts.
# (Lexical.e*Grammatical.e|Subj) represents the random intercepts and slopes
# for subjects, and (1|Item) represents the random intercepts
# (recall the Y~1 syntax for intercept-only models)
# This is a so-called "maximal model", as recommended by Barr et al. (2013):
lexgram.lmer1 = lmer(logRT ~ Lexical.e * Grammatical.e # Fixed part of model
                     + (Lexical.e*Grammatical.e|Subj) + (1|Item), # Random part of model
                     data = chars.exp.yes)
# Warning: Model "failed to converge"! I told you mixed-effects models crash a lot.
# Following Matuschek et al. (2017), we can try to simplify the random part of the
# model. The maximal model actually includes not just random intercepts and
# random slopes, but also an interaction between these two. We can ignore this
# interaction if we split (Lexical.e*Grammatical.e|Subj) into
# (1|Subj) (intercept only) plus (0+Lexical.e*Grammatical.e|Subj) (slopes only).
# (Y ~ 0 + X1 + ... is the formula syntax for a regression model without an intercept)
lexgram.lmer2 = lmer(logRT ~ Lexical.e * Grammatical.e
                     + (1|Subj) + (0+Lexical.e*Grammatical.e|Subj) + (1|Item),
                     data = chars.exp.yes)
# Still too complex. Now let's remove the interaction inside the random slopes:
lexgram.lmer3 = lmer(logRT ~ Lexical.e * Grammatical.e
                     + (1|Subj) + (0+Lexical.e+Grammatical.e|Subj) + (1|Item),
                     data = chars.exp.yes)
# Aha! It converges! Let's look at the results:
summary(lexgram.lmer3)

### 8.4 Hey! Where are the p values? Welcome to yet another controversy about
###     linear mixed-effects modeling: the experts disagree on what df to use,
###     so they aren't sure how to estimate the p values from the t values.
###     The simplest response to this nonsense is to treat t as if it were z,
###     since t distributions turn into the normal distribution as the sample
###     size increases. So if you have many data points, t > 2 or so is pretty
###     much guaranteed to have p < .05, no matter how df is precisely calculated.
# We do indeed have a pretty large sample, so it looks like we have a significant
# main effect of Lexical.e and a significant interaction with Grammatical.e,
# but no main effect of Grammatical.e.
nrow(chars.exp.yes)

### 8.5 If you want actual p values, you could compare your full model with one
###     missing just one model parameter. This approach makes the most sense
###     for an additive model (without any interaction), or if you're testing
###     only the interaction, since a model like Y~X1+X1:X2 (an interaction
###     but missing one of the factors composing it) doesn't seem to make sense.
lexgram.noint.lmer3 = lmer(logRT ~ Lexical.e + Grammatical.e
                           + (1|Subj) + (0+Lexical.e+Grammatical.e|Subj) + (1|Item),
                           data = chars.exp.yes)
anova(lexgram.lmer3,lexgram.noint.lmer3) # Get p value for interaction

### 8.6 A general solution is to install the "afex" package (pun for "effects"),
###     which gives you many p value options in its function mixed().
###     By default this gets p values using something called the Kenward-Roger
###     approximation:
library(afex) # Annoyingly, it changes the way lme4 functions work, but not too much...
lexgram.lmer3.mixed = mixed(logRT ~ Lexical.e * Grammatical.e
                            + (1|Subj) + (0+Lexical.e+Grammatical.e|Subj) + (1|Item),
                            data = chars.exp.yes)
lexgram.lmer3.mixed # Also annoyingly, summary() doesn't work; just type in the model name
# See? The t = z method is probably good enough, especially if |t| >> 2

### 8.7 Let's see if LME can detect any effect of log element frequency
###     and log stroke number on logRT (unlikely, since LME p values tend to
###     be higher, since they recognize that the data points are not all
###     independent).
# Again, logStrokes and logElTypeFreq are both "within" subject but "between" item,
# and we can run this on the same data frame we just used.
# And - lucky us! - the maximal model converges!
strokefreqRT.mixed = mixed(logRT~logStrokes+logElTypeFreq
                           + (logStrokes+logElTypeFreq|Subj) + (1|Item),
                           data = chars.exp.yes)
strokefreqRT.mixed # Still nonsignificant
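### (A small aside: if you just want the fixed-effect estimates from an lmer() model
### like lexgram.lmer3 above, e.g. to report effect directions, you can pull them out
### directly.)
fixef(lexgram.lmer3) # Just the fixed-effect coefficients
# ranef(lexgram.lmer3) # The by-subject and by-item adjustments, if you're curious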
### 8.8 Mixed-effects modeling applies to binary data too, in something called
###     mixed-effects logistic regression, using the glmer() function.
###     Let's see how lexicality and grammaticality affect yes/no responses
###     in the experimental trials. Recall that our ordinary logistic regression
###     model on this (lexgram.glm) ignored the fact that responses were correlated
###     within subject and item. But now we can take these random effects into account!
# Once again, the two key variables are "within" subjects but "between" items.
# Be patient - generalized linear mixed-effects models can take a while to build:
lexgram.glmer1 = glmer(Response~Lexical.e*Grammatical.e
                       + (Lexical.e*Grammatical.e|Subj) + (1|Item),
                       family="binomial", data = chars.exp)
# No surprise: it fails to converge. Let's simplify the random effects:
lexgram.glmer2 = glmer(Response~Lexical.e*Grammatical.e
                       + (1|Subj) + (0+Lexical.e*Grammatical.e|Subj) + (1|Item),
                       family="binomial", data = chars.exp)
# Not simple enough. Here's our next try:
lexgram.glmer3 = glmer(Response~Lexical.e*Grammatical.e
                       + (1|Subj) + (0+Lexical.e+Grammatical.e|Subj) + (1|Item),
                       family="binomial", data = chars.exp)
# Still not simple enough. Now let's try getting rid of random slopes entirely:
lexgram.glmer4 = glmer(Response~Lexical.e*Grammatical.e + (1|Subj) + (1|Item),
                       family="binomial", data = chars.exp)
# That converges! Hopefully the p values aren't too close to .05, since
# our simple model risks Type II errors:
summary(lexgram.glmer4) # No worries! p < .0001 for both of the main effects
# Hm, could we make the model even simpler? After all, the experimental design
# matched all of the item-based variables, so they should cancel out,
# possibly making it unnecessary to treat Item as a random variable:
lexgram.glmer5 = glmer(Response~Lexical.e*Grammatical.e + (1|Subj),
                       family="binomial", data = chars.exp)
# Let's see if this model is TOO simple:
anova(lexgram.glmer5, lexgram.glmer4, test="Chisq") # Yup, it has significantly worse fit

### 8.9 Of course mixed-effects models can also handle numerical independent variables:
elementstrokes.glmer1 = glmer(Response~logElTypeFreq+logStrokes
                              + (logElTypeFreq+logStrokes|Subj) + (1|Item),
                              family="binomial", data = chars.exp)
# Didn't converge. Let's simplify:
elementstrokes.glmer2 = glmer(Response~logElTypeFreq+logStrokes
                              + (1|Subj) + (0+logElTypeFreq+logStrokes|Subj) + (1|Item),
                              family="binomial", data = chars.exp)
# Didn't converge again. Let's simplify again:
elementstrokes.glmer3 = glmer(Response~logElTypeFreq+logStrokes + (1|Subj) + (1|Item),
                              family="binomial", data = chars.exp)
# Converged! Here are the results:
summary(elementstrokes.glmer3) # p values are far from .05, so no worries
# Let's plot our model (yes, the effects package works for mixed-effects models)
plot(allEffects(elementstrokes.glmer3)) # See the wide band for the nonsignificant logStrokes?
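### (One more optional aside: if you want the model's fitted probability of a "yes"
### response for each trial, say for plotting or sanity checks, predict() also works
### on glmer models:)
head(predict(elementstrokes.glmer3, type="response")) # Fitted P(Response = 1) for the first rows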
### 9. Finally, let's play with Prof. Hsu's data....
###    I don't entirely understand her data set, but I'll start with
###    some guesses about what she might want to do.
###    This is self-paced reading data on English sentences.
###    There are several "regions of interest", i.e. particular syntactic positions
###    in the sentences, where she cares about the looking time (RT in milliseconds)
###    as a function of two crossed binary factors (Reference x Ambiguity),
###    though in the raw file they're coded as a single four-level factor.
###    Apparently she usually does separate by-subject and by-item ANOVAs
###    for each region of interest, so we'll try that first.
###    But then we'll try to go further with mixed-effects modeling
###    (in a real paper, we would only take ONE of these approaches,
###    since ANOVA is just a special case of mixed-effects modeling, so doing
###    both would be redundant).

## First create the data file:
## - Copy/paste the contents of the Excel file sheet "Data" into "HsuData.txt"
## - Then load it in:
spr = read.delim("HsuData.txt") # spr for self-paced reading...
head(spr) # It's already arranged in an R-friendly way, as is usually
# the case for experimental control programs (E-Prime, etc)
# I guess these are the crucial variables:
# COND: Actually two crossed binary factors:
#       a = 1-referent, ambiguous
#       b = 1-referent, unambiguous
#       c = 2-referent, ambiguous
#       d = 2-referent, unambiguous
# ITEM: Item ID number (sentence)
# SUBJ: Subject ID number
# SBIN: Order of sentence presentation (randomly different for each subject)
# WNUM: Order of words in this particular sentence
# WORD: The actual English word shown (including punctuation)
# RNUM: Region of interest (0 = ignored; 1...11 = interesting)
# RWRT: Looking time for this word (in milliseconds)

## First let's check if the reading times are normally distributed:
qqnorm(spr$RWRT); qqline(spr$RWRT) # Nope.
# Let's try log-norming them:
spr$logRWRT = log(spr$RWRT)
qqnorm(spr$logRWRT); qqline(spr$logRWRT) # Somewhat better, but not great
hist(spr$logRWRT) # Seems to be a few super-slow responses
# What if we treat all log RTs more than 3 SD from the overall mean as "outliers"?
spr.3sd = subset(spr, abs(scale(spr$logRWRT)) < 3)
qqnorm(spr.3sd$logRWRT); qqline(spr.3sd$logRWRT) # Better - let's say good enough
hist(spr.3sd$logRWRT) # Yeah... we'll live with this
# How much data did we throw out?
nrow(spr)-nrow(spr.3sd)
(nrow(spr)-nrow(spr.3sd))/nrow(spr) # Only about 2%
# You can try running all of the following analyses on the full data set,
# and/or on the raw RTs - maybe the significance patterns will end up basically
# the same. But I'm trying to "do it right"....

## Now let's convert COND into two crossed binary factors REF and AMB:
spr.3sd$REF = "One" # Default
spr.3sd$REF[spr.3sd$COND == "c" | spr.3sd$COND == "d"] = "Two" # | means OR
spr.3sd$AMB = "Ambig" # Default
spr.3sd$AMB[spr.3sd$COND == "b" | spr.3sd$COND == "d"] = "Unambig"
spr.3sd # Confirm that we did it right

## OK, let's compute the by-subject means for our ANOVAs, arranged so that
## we can analyze logRWRT ~ REF * AMB for each region, which is our main goal (?)
spr.subj = aggregate(logRWRT ~ SUBJ * RNUM * REF * AMB, data = spr.3sd, mean)
head(spr.subj) # Looks OK
# It seems that REF and AMB are within subject:
xtabs(~ SUBJ + REF + AMB, data = spr.subj)
# Here's a repeated-measures ANOVA for Region 1:
summary(aov(logRWRT ~ REF * AMB + Error(as.factor(SUBJ)/(REF * AMB)),
            data = subset(spr.subj, spr.subj$RNUM == 1)))
# We can actually analyze all of the regions one by one, in a loop,
# but first let's confirm that we have all REF * AMB combos for all regions:
xtabs(~REF + AMB + RNUM, data = spr.subj)
# Nope, regions 3 & 4 are always unambiguous, so we'll analyze them later...
for (i in c(1:2,5:11)) { # Loops don't display results by default, so use print()
  print(i) # So we know which region we're analyzing...
  print(
    summary(aov(logRWRT ~ REF * AMB + Error(as.factor(SUBJ)/(REF * AMB)),
                data = subset(spr.subj, spr.subj$RNUM == i)))
  )
}
# And here are regions 3 & 4:
for (i in 3:4) { # Only look at REF (so basically paired t tests)
  print(i)
  print(
    summary(aov(logRWRT ~ REF + Error(as.factor(SUBJ)/REF),
                data = subset(spr.subj, spr.subj$RNUM == i)))
  )
}

## Now for the by-item analyses.
spr.item = aggregate(logRWRT ~ ITEM * RNUM * REF * AMB, data = spr.3sd, mean)
head(spr.item)
# It seems that REF and AMB are also within item:
xtabs(~ ITEM + REF + AMB, data = spr.item)
# Let's do the ANOVAs in one loop, using if() to switch models for regions 3 & 4:
for (i in 1:11) {
  print(i)
  if (is.element(i,c(3,4))) { # If i is in the set {3,4}
    print(
      summary(aov(logRWRT ~ REF + Error(as.factor(ITEM)/REF),
                  data = subset(spr.item, spr.item$RNUM == i)))
    )
  } else { # Crucially, "else" must appear on the same line as } ...
    print(
      summary(aov(logRWRT ~ REF * AMB + Error(as.factor(ITEM)/(REF * AMB)),
                  data = subset(spr.item, spr.item$RNUM == i)))
    )
  } # End of if...else...
}

## But... are the RTs really independent across the regions?
## And if we find that factor X has p < .05 for region A but p > .06 for region B,
## this does NOT mean that the two regions are different for factor X
## (see above discussion...)
## Better would be to include region as a numerical covariate (since it's
## naturally ordered): a main effect of region would imply a linear change
## over the course of the sentence, and interactions with REF or AMB
## would imply linearly increasing or decreasing REF or AMB effects over
## the course of the sentence.
# Do by-subjects and by-items analyses in repeated-measures ANCOVA models
# (I'll drop regions 3 & 4, so we can use the same REF * AMB model throughout):
summary(aov(logRWRT ~ RNUM * REF * AMB + Error(as.factor(SUBJ)/(RNUM * REF * AMB)),
            data = subset(spr.subj, !is.element(spr.subj$RNUM, c(0,3,4))))) # ! = not
summary(aov(logRWRT ~ RNUM * REF * AMB + Error(as.factor(ITEM)/(RNUM * REF * AMB)),
            data = subset(spr.item, !is.element(spr.item$RNUM, c(0,3,4))))) # ! = not
# Only an interaction between RNUM and REF.
# Does this pattern make any sense? Hard to tell....
ggplot(spr.subj, aes(x=RNUM, y=logRWRT, fill=REF)) +
  geom_bar(position="dodge", stat="identity") # aes = aesthetics; geom = shapes; etc
# I guess Prof. Hsu would rather test for interactions in REF & AMB for specific
# pairs or subsets of regions that have particular theoretical interest;
# treating RNUM as a number implies a sentence-wide linear effect that doesn't
# make linguistic sense....

## Let's redo the main analyses as by-subject-and-item mixed-effects models,
## applied to the "raw" log RTs in spr.3sd, not the means.
# Since we want to test for the REF * AMB interaction in a regression model,
# the results will be easier to interpret if we use effect coding (-1 vs. 1):
spr.3sd$REF.e = factor(spr.3sd$REF) # Initialize factor
contrasts(spr.3sd$REF.e) = contr.sum(levels(spr.3sd$REF.e))
spr.3sd$AMB.e = factor(spr.3sd$AMB) # Initialize factor
contrasts(spr.3sd$AMB.e) = contr.sum(levels(spr.3sd$AMB.e))

## To save time, we'll just look at region 1, using mixed() in afex
## to get p values - this is a complex model, so it's quite slow...
reg1.lmer1 = mixed(logRWRT ~ REF.e * AMB.e
                   + (REF.e * AMB.e | SUBJ) + (REF.e * AMB.e | ITEM),
                   data = subset(spr.3sd, spr.3sd$RNUM == 1))
# Didn't converge. Let's simplify by a minimal amount:
reg1.lmer2 = mixed(logRWRT ~ REF.e * AMB.e
                   + (REF.e * AMB.e | SUBJ) + (1 | ITEM) + (0 + REF.e * AMB.e | ITEM),
                   data = subset(spr.3sd, spr.3sd$RNUM == 1))
# Yay! It converged! Here are the results:
reg1.lmer2 # With p values
summary(reg1.lmer2) # lmer() output, with other regression information
# Compare with the by-subject and by-item ANOVAs;
# note that the p value for the by-subject-and-item mixed-effects model is higher:
# this is typical (remember minF'?)
summary(aov(logRWRT ~ REF * AMB + Error(as.factor(SUBJ)/(REF * AMB)),
            data = subset(spr.subj, spr.subj$RNUM == 1)))
summary(aov(logRWRT ~ REF * AMB + Error(as.factor(ITEM)/(REF * AMB)),
            data = subset(spr.item, spr.item$RNUM == 1)))

## But we have other variables that might affect reading times:
## SBIN (order of sentence presentation: may reveal practice or fatigue),
## WORD (actual words: does word length matter?)
# Let's add a variable WLEN for word length:
spr.3sd$WLEN = nchar(as.character(spr.3sd$WORD)) # WORD loaded as factor by default
head(spr.3sd)
# Just for fun, let's see how these "nuisance" variables affect reading times.
# The LME algorithm converges better if the independent variables are all on
# the same scale, so let's use the z score trick, which will also give us
# standardized beta coefficients representing effect sizes:
spr.3sd$SBIN.z = scale(spr.3sd$SBIN)
spr.3sd$WLEN.z = scale(spr.3sd$WLEN)
spr.3sd$logRWRT.z = scale(spr.3sd$logRWRT)
# Just look at region 2 (because region 1 always has the same word "the")
reg1.sbin.wlen.lmer1 = mixed(logRWRT.z ~ SBIN.z + WLEN.z
                             + (SBIN.z + WLEN.z | SUBJ) + (SBIN.z + WLEN.z | ITEM),
                             data = subset(spr.3sd, spr.3sd$RNUM == 2))
# It converges on our first try! Here are the results:
reg1.sbin.wlen.lmer1 # p values
summary(reg1.sbin.wlen.lmer1) # Coefficient signs and magnitudes are useful
# The SBIN effect on RT is negative: with practice, people read faster
# The WLEN effect on RT is positive: longer words take longer to read