The sd function is what you need:
sd(0:100)
Type demo(plotmath) and hit Enter, or click the plots to see what's on offer.
all.equal is usually what you need:
x == z
identical(x, z)
all.equal(x, z)
all.equal(x, z, tolerance = 0)
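If you need a single TRUE/FALSE answer (inside an if condition, say), remember that all.equal returns a description of the differences rather than FALSE when the values don't match; a small sketch, wrapping the comparison in isTRUE:
isTRUE(all.equal(x, z))
isTRUE(all.equal(x, z, tolerance = 0))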
true_and_missing <- c(NA, TRUE, NA)
false_and_missing <- c(FALSE, FALSE, NA)
mixed <- c(TRUE, FALSE, NA)
any(true_and_missing)
any(false_and_missing)
any(mixed)
all(true_and_missing)
all(false_and_missing)
all(mixed)
class(Inf)
class(NA)
class(NaN)
class("")
Repeat with typeof, mode, and storage.mode.
pets <- factor(sample(
  c("dog", "cat", "hamster", "goldfish"),
  1000,
  replace = TRUE
))
head(pets)
summary(pets)
Converting to factors is recommended but not compulsory.
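For comparison, here is a quick sketch (not part of the original answer) of what happens if you skip the factor conversion; summary on a character vector only reports its length, class, and mode, though table still gives the counts:
pets_chr <- sample(c("dog", "cat", "hamster", "goldfish"), 1000, replace = TRUE)
summary(pets_chr)   # length, class, and mode only -- no counts per animal
table(pets_chr)     # the per-animal counts are still available this way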
carrot <- 1
potato <- 2
swede <- 3
ls(pattern = "a")
Your vegetables may vary.
Use seq_len and seq_along:
n <- seq_len(20)
triangular <- n * (n + 1) / 2
names(triangular) <- letters[seq_along(n)]
triangular[c("a", "e", "i", "o")]
abs gives you the absolute value of a number:
diag(abs(seq.int(-11, 11)))
identity_20_by_21 <- diag(rep.int(1, 20), 20, 21)
below_the_diagonal <- rbind(0, identity_20_by_21)
identity_21_by_20 <- diag(rep.int(1, 20), 21, 20)
above_the_diagonal <- cbind(0, identity_21_by_20)
on_the_diagonal <- diag(abs(seq.int(-10, 10)))
wilkinson_21 <- below_the_diagonal + above_the_diagonal + on_the_diagonal
eigen(wilkinson_21)$values
For simplicity, here I’ve manually specified which numbers are square. Can you think of a way to automatically determine if a number is square?
list(
  "0 to 9"   = c(0, 1, 4, 9),
  "10 to 19" = 16,
  "20 to 29" = 25,
  "30 to 39" = 36,
  "40 to 49" = 49,
  "50 to 59" = NULL,
  "60 to 69" = 64,
  "70 to 79" = NULL,
  "80 to 89" = 81,
  "90 to 99" = NULL
)
More fancily, we can automate the calculation for square numbers. Here, the cut function creates different groups with the ranges 0 to 9, 10 to 19, and so on, then returns a vector stating which group each square number is in. Then the split function splits the square number vector into a list, with each element containing the values from the corresponding group:
x <- 0:99
sqrt_x <- sqrt(x)
is_square_number <- sqrt_x == floor(sqrt_x)
square_numbers <- x[is_square_number]
groups <- cut(
  square_numbers,
  seq.int(min(x), max(x), 10),
  include.lowest = TRUE,
  right = FALSE
)
split(square_numbers, groups)
There are many functions for exploring the iris dataset. Experiment with them!
iris_numeric <- iris[, 1:4]
colMeans(iris_numeric)
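The same column-wise idea extends to other statistics; a small sketch (not part of the original answer) using sapply:
sapply(iris_numeric, sd)   # standard deviation of each measurement column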
beaver1$id <- 1
beaver2$id <- 2
both_beavers <- rbind(beaver1, beaver2)
subset(both_beavers, as.logical(activ))
Create the environment with new.env. After that, the syntax is the same as with lists:
multiples_of_pi <- new.env()
multiples_of_pi[["two_pi"]] <- 2 * pi
multiples_of_pi$three_pi <- 3 * pi
assign("four_pi", 4 * pi, multiples_of_pi)
ls(multiples_of_pi)
## [1] "four_pi"  "three_pi" "two_pi"
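To get the values back out, the list-style accessors work too, and you can snapshot the whole environment as a list; a brief sketch:
multiples_of_pi$three_pi
get("four_pi", envir = multiples_of_pi)
as.list(multiples_of_pi)   # copy the environment's contents into a named list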
is_even <- function(x) (x %% 2) == 0
is_even(c(-5:5, Inf, -Inf, NA, NaN))
##  [1] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
## [12]    NA    NA    NA    NA
The formals and body functions do the hard work, and we just need to return a list of their results:
args_and_body <- function(fn)
{
  list(arguments = formals(fn), body = body(fn))
}
args_and_body(var)
## $arguments
## $arguments$x
##
##
## $arguments$y
## NULL
##
## $arguments$na.rm
## [1] FALSE
##
## $arguments$use
##
##
##
## $body
## {
##     if (missing(use))
##         use <- if (na.rm)
##             "na.or.complete"
##         else "everything"
##     na.method <- pmatch(use, c("all.obs", "complete.obs",
##         "pairwise.complete.obs", "everything", "na.or.complete"))
##     if (is.na(na.method))
##         stop("invalid 'use' argument")
##     if (is.data.frame(x))
##         x <- as.matrix(x)
##     else stopifnot(is.atomic(x))
##     if (is.data.frame(y))
##         y <- as.matrix(y)
##     else stopifnot(is.atomic(y))
##     .Call(C_cov, x, y, na.method, FALSE)
## }
args_and_body(alarm)
## $arguments
## NULL
##
## $body
## {
##     cat("\a")
##     flush.console()
## }
# split on an optional comma, a compulsory space, an optional
# hyphen, and an optional space
strsplit(x, ",? -? ?")
## [[1]]
## [1] "Swan"  "swam"  "over"  "the"   "pond"  "Swim"  "swan"  "swim!"
##
## [[2]]
## [1] "Swan"  "swam"  "back"  "again" "Well"  "swum"  "swan!"
# or, grouping the final hyphen and space
strsplit(x, ",? (- )?")
## [[1]]
## [1] "Swan"  "swam"  "over"  "the"   "pond"  "Swim"  "swan"  "swim!"
##
## [[2]]
## [1] "Swan"  "swam"  "back"  "again" "Well"  "swum"  "swan!"
By default, cut creates intervals that include an upper bound but not a lower bound. In order to get a count for the smallest value (3), we need an extra break point below it (at 2, for example). Try replacing table with count from the plyr package:
scores <- three_d6(1000)
bonuses <- cut(
  scores,
  c(2, 3, 5, 8, 12, 15, 17, 18),
  labels = -3:3
)
table(bonuses)
## bonuses
##  -3  -2  -1   0   1   2   3
##   4  39 186 486 233  47   5
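Following that suggestion, a minimal sketch with count from plyr (assuming the package is installed) returns the same tallies as a data frame:
library(plyr)
count(bonuses)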
Use if and else blocks to distinguish the conditions. The %in% operator makes checking the conditions easier:
score <- two_d6(1)
if(score %in% c(2, 3, 12))
{
  game_status <- FALSE
  point <- NA
} else if(score %in% c(7, 11))
{
  game_status <- TRUE
  point <- NA
} else
{
  game_status <- NA
  point <- score
}
A repeat loop is most appropriate:
if(is.na(game_status))
{
  repeat({
    score <- two_d6(1)
    if(score == 7)
    {
      game_status <- FALSE
      break
    } else if(score == point)
    {
      game_status <- TRUE
      break
    }
  })
}
A for loop is most appropriate:
nchar_sea_shells <- nchar(sea_shells)
for(i in min(nchar_sea_shells):max(nchar_sea_shells))
{
  message("These words have ", i, " letters:")
  print(toString(unique(sea_shells[nchar_sea_shells == i])))
}
## These words have 2 letters:
## [1] "by, So, if, on"
## These words have 3 letters:
## [1] "She, sea, the, The, she, are, I'm"
## These words have 4 letters:
## [1] "sure"
## These words have 5 letters:
## [1] "sells"
## These words have 6 letters:
## [1] "shells, surely"
## These words have 7 letters:
## [1] ""
## These words have 8 letters:
## [1] "seashore"
## These words have 9 letters:
## [1] "seashells"
vapply is the best choice:
vapply(wayans, length, integer(1))
##   Dwayne Kim Keenen Ivory        Damon          Kim        Shawn
##            0            5            4            0            3
##       Marlon        Nadia       Elvira       Diedre       Vonnie
##            2            0            2            5            0
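For comparison, sapply would give the same numbers here, but vapply is safer because the integer(1) template guarantees the type and length of each result; a one-line sketch:
sapply(wayans, length)   # same values, but no guarantee about the result type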
We can get a feel for the dataset by using str, head, and class, and by reading the help page ?state.x77:
str(state.x77)
##  num [1:50, 1:8] 3615 365 2212 2110 21198 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##   ..$ : chr [1:8] "Population" "Income" "Illiteracy" "Life Exp" ...
head(state.x77)
##            Population Income Illiteracy Life Exp Murder HS Grad Frost
## Alabama          3615   3624        2.1    69.05   15.1    41.3    20
## Alaska            365   6315        1.5    69.31   11.3    66.7   152
## Arizona          2212   4530        1.8    70.55    7.8    58.1    15
## Arkansas         2110   3378        1.9    70.66   10.1    39.9    65
## California      21198   5114        1.1    71.71   10.3    62.6    20
## Colorado         2541   4884        0.7    72.06    6.8    63.9   166
##              Area
## Alabama     50708
## Alaska     566432
## Arizona    113417
## Arkansas    51945
## California 156361
## Colorado   103766
class(state.x77)
## [1] "matrix"
Now that we know that the data is a matrix, the apply function is most appropriate. We want to loop over columns, so we pass 2 for the dimension value:
apply(state.x77, 2, mean)
## Population     Income Illiteracy   Life Exp     Murder    HS Grad
##   4246.420   4435.800      1.170     70.879      7.378     53.108
##      Frost       Area
##    104.460  70735.880
apply(state.x77, 2, sd)
## Population     Income Illiteracy   Life Exp     Murder    HS Grad
##  4.464e+03  6.145e+02  6.095e-01  1.342e+00  3.692e+00  8.077e+00
##      Frost       Area
##  5.198e+01  8.533e+04
tapply is the best choice from base R. ddply from plyr also works nicely:
with(commute_data, tapply(time, mode, quantile, prob = 0.75))
##  bike   bus   car train
## 63.55 55.62 42.33 36.29
ddply(commute_data, .(mode), summarize, time_p75 = quantile(time, 0.75))
##    mode time_p75
## 1  bike    63.55
## 2   bus    55.62
## 3   car    42.33
## 4 train    36.29
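A third option from base R, shown as a quick sketch (not in the original answer), is aggregate, which also returns a data frame:
aggregate(time ~ mode, commute_data, quantile, probs = 0.75)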
install.packages("lubridate")
You might want to specify a library to install to.
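For example, a hedged sketch of installing to and loading from a specific library (the path here is hypothetical; substitute your own):
install.packages("lubridate", lib = "~/R/library")   # "~/R/library" is an example path
library(lubridate, lib.loc = "~/R/library")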
The installed.packages function retrieves the details of the packages on your machine and their locations (the results will be different for your machine):
pkgs <- installed.packages()
table(pkgs[, "LibPath"])
##
## C:/Program Files/R/R-devel/library                       D:/R/library
##                                  29                                169
Use strptime to parse and strftime to format, though there are many possibilities:
in_string <- c("1940-07-07", "1940-10-09", "1942-06-18", "1943-02-25")
(parsed <- strptime(in_string, "%Y-%m-%d"))
## [1] "1940-07-07" "1940-10-09" "1942-06-18" "1943-02-25"
(out_string <- strftime(parsed, "%a %d %b %y"))
## [1] "Sun 07 Jul 40" "Wed 09 Oct 40" "Thu 18 Jun 42" "Thu 25 Feb 43"
The ?Sys.timezone help page suggests this code for importing the time zone file:
tzfile <- file.path(R.home("share"), "zoneinfo", "zone.tab")
tzones <- read.delim(
  tzfile,
  row.names = NULL,
  header = FALSE,
  col.names = c("country", "coords", "name", "comments"),
  as.is = TRUE,
  fill = TRUE,
  comment.char = "#"
)
The best way to find your own time zone depends on where you are. Start with View(tzones) and use subset to narrow your search if you need to.
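For example, a small sketch of narrowing the table down by country (the country code is just an illustration):
subset(tzones, country == "GB")   # "GB" is an example ISO country code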
You can solve this using base R by accessing components of a POSIXlt date, but it's a little bit cleaner to use the lubridate month and day functions:
zodiac_sign <- function(x)
{
  month_x <- month(x, label = TRUE)
  day_x <- day(x)
  switch(
    month_x,
    Jan = if(day_x < 20) "Capricorn" else "Aquarius",
    Feb = if(day_x < 19) "Aquarius" else "Pisces",
    Mar = if(day_x < 21) "Pisces" else "Aries",
    Apr = if(day_x < 20) "Aries" else "Taurus",
    May = if(day_x < 21) "Taurus" else "Gemini",
    Jun = if(day_x < 21) "Gemini" else "Cancer",
    Jul = if(day_x < 23) "Cancer" else "Leo",
    Aug = if(day_x < 23) "Leo" else "Virgo",
    Sep = if(day_x < 23) "Virgo" else "Libra",
    Oct = if(day_x < 23) "Libra" else "Scorpio",
    Nov = if(day_x < 22) "Scorpio" else "Sagittarius",
    Dec = if(day_x < 22) "Sagittarius" else "Capricorn"
  )
}
# Usage is, for example,
nicolaus_copernicus_birth_date <- as.Date("1473-02-19")
zodiac_sign(nicolaus_copernicus_birth_date)
## [1] "Pisces"
Note that the switch statement means that this implementation of the function isn't vectorized. You can achieve vectorization by using lots of ifelse statements or, more easily, by calling Vectorize(zodiac_sign).
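A quick sketch of that Vectorize approach (the second date and the _v suffix are just for illustration):
zodiac_sign_v <- Vectorize(zodiac_sign)
zodiac_sign_v(as.Date(c("1473-02-19", "1564-02-15")))   # returns one sign per date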
Use system.file to locate the file and read.csv to import the data:
hafu_file <- system.file("extdata", "hafu.csv", package = "learningr")
hafu_data <- read.csv(hafu_file)
Use read.xlsx2 from the xlsx package to import the data. The data is in the first (and only) sheet, and it is best to specify the types of data in each column:
library(xlsx)
gonorrhoea_file <- system.file(
  "extdata",
  "multi-drug-resistant gonorrhoea infection.xls",
  package = "learningr"
)
gonorrhoea_data <- read.xlsx2(
  gonorrhoea_file,
  sheetIndex = 1,
  colClasses = c("integer", "character", "character", "numeric")
)
There are several ways of doing this using the RSQLite package. One step at a time, we can connect like this:
library(RSQLite)
driver <- dbDriver("SQLite")
db_file <- system.file("extdata", "crabtag.sqlite", package = "learningr")
conn <- dbConnect(driver, db_file)
query <- "SELECT * FROM Daylog"
head(daylog <- dbGetQuery(conn, query))
##   Tag ID Mission Day       Date Max Temp Min Temp Max Depth Min Depth
## 1 A03401           0 08/08/2008    27.73    25.20      0.06     -0.07
## 2 A03401           1 09/08/2008    25.20    23.86      0.06     -0.07
## 3 A03401           2 10/08/2008    24.02    23.50     -0.07     -0.10
## 4 A03401           3 11/08/2008    26.45    23.28     -0.04     -0.10
## 5 A03401           4 12/08/2008    27.05    23.61     -0.10     -0.26
## 6 A03401           5 13/08/2008    24.62    23.44     -0.04     -0.13
##   Batt Volts
## 1       3.06
## 2       3.09
## 3       3.09
## 4       3.09
## 5       3.09
## 6       3.09
dbDisconnect(conn)
## [1] TRUE
dbUnloadDriver(driver)
## [1] TRUE
Or, more cheekily, we can use the function defined in Chapter 12:
head(daylog <- query_crab_tag_db("SELECT * FROM Daylog"))
Or we can use the dbReadTable function to simplify things a little further:
get_table_from_crab_tag_db <- function(tbl)
{
  driver <- dbDriver("SQLite")
  db_file <- system.file("extdata", "crabtag.sqlite", package = "learningr")
  conn <- dbConnect(driver, db_file)
  on.exit({
    dbDisconnect(conn)
    dbUnloadDriver(driver)
  })
  dbReadTable(conn, tbl)
}
head(daylog <- get_table_from_crab_tag_db("Daylog"))
##   Tag.ID Mission.Day       Date Max.Temp Min.Temp Max.Depth Min.Depth
## 1 A03401           0 08/08/2008    27.73    25.20      0.06     -0.07
## 2 A03401           1 09/08/2008    25.20    23.86      0.06     -0.07
## 3 A03401           2 10/08/2008    24.02    23.50     -0.07     -0.10
## 4 A03401           3 11/08/2008    26.45    23.28     -0.04     -0.10
## 5 A03401           4 12/08/2008    27.05    23.61     -0.10     -0.26
## 6 A03401           5 13/08/2008    24.62    23.44     -0.04     -0.13
##   Batt.Volts
## 1       3.06
## 2       3.09
## 3       3.09
## 4       3.09
## 5       3.09
## 6       3.09
To detect question marks, use str_detect:
library(stringr)
data(hafu, package = "learningr")
hafu$FathersNationalityIsUncertain <- str_detect(hafu$Father, fixed("?"))
hafu$MothersNationalityIsUncertain <- str_detect(hafu$Mother, fixed("?"))
To replace those question marks, use str_replace:
hafu$Father <- str_replace(hafu$Father, fixed("?"), "")
hafu$Mother <- str_replace(hafu$Mother, fixed("?"), "")
Make sure you have the reshape2 package installed! The trick is to name the measurement variables, "Father" and "Mother":
hafu_long <- melt(hafu, measure.vars = c("Father", "Mother"))
Using base R, table gives us the counts, and a combination of sort and head allows us to find the most common values. Passing useNA = "always" to table means that NA will always be included in the counts vector:
top10 <- function(x)
{
  counts <- table(x, useNA = "always")
  head(sort(counts, decreasing = TRUE), 10)
}
top10(hafu$Mother)
## x
##  Japanese      <NA>   English  American    French    German   Russian
##       120        50        29        23        20        12        10
##   Fantasy   Italian Brazilian
##         8         4         3
The plyr package provides an alternative solution that returns the answer as a data frame, which may be more useful for further manipulation of the result:
top10_v2 <- function(x)
{
  counts <- count(x)
  head(arrange(counts, desc(freq)), 10)
}
top10_v2(hafu$Mother)
##            x freq
## 1   Japanese  120
## 2       <NA>   50
## 3    English   29
## 4   American   23
## 5     French   20
## 6     German   12
## 7    Russian   10
## 8    Fantasy    8
## 9    Italian    4
## 10 Brazilian    3
cor calculates correlations, and defaults to Pearson correlations. Experiment with the method argument for other types:
with(obama_vs_mccain, cor(Unemployment, Obama))
## [1] 0.2897
In base/lattice/ggplot2 order, we have:
with(obama_vs_mccain, plot(Unemployment, Obama))
xyplot(Obama ~ Unemployment, obama_vs_mccain)
ggplot(obama_vs_mccain, aes(Unemployment, Obama)) +
  geom_point()
Histograms:
plot_numbers <- 1:2
layout(matrix(plot_numbers, ncol = 2, byrow = TRUE))
for(drug_use in c(FALSE, TRUE))
{
  group_data <- subset(alpe_d_huez2, DrugUse == drug_use)
  with(group_data, hist(NumericTime))
}
histogram(~ NumericTime | DrugUse, alpe_d_huez2)
ggplot(alpe_d_huez2, aes(NumericTime)) +
  geom_histogram(binwidth = 2) +
  facet_wrap(~ DrugUse)
Box plots:
boxplot(NumericTime ~ DrugUse, alpe_d_huez2)
bwplot(DrugUse ~ NumericTime, alpe_d_huez2)
ggplot(alpe_d_huez2, aes(DrugUse, NumericTime, group = DrugUse)) +
  geom_boxplot()
For simplicity, the answers to this are given using ggplot2. Since this is a data exploration, there is no "right" answer: if the plot shows you something interesting, then it was worth drawing. When you have several "confounding factors" (in this case we have year/ethnicity/gender), there are lots of different possibilities for different plot types. In general, there are two strategies for dealing with many variables: start with an overall summary and add in variables, or start with everything and remove the variables that don't look very interesting. We'll use the second strategy.
The simplest technique for seeing the effects of each variable is to facet by everything:
ggplot(gonorrhoea, aes(Age.Group, Rate)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ Year + Ethnicity + Gender)
There is a difference between the heights of the bars in each panel, particularly with respect to ethnicity, but with 50 panels it's hard to see what is going on. We can simplify by plotting each year together on the same panel. We use group to state which values belong in the same bar, fill to give each bar a different fill color, and position = "dodge" to put the bars next to each other rather than stacked one on top of another:
ggplot(gonorrhoea, aes(Age.Group, Rate, group = Year, fill = Year)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Ethnicity + Gender)
The reduced number of panels is better, but I find it hard to get much meaning from all those bars. Since most of the age groups are the same width (five years), we can cheat a little and draw a line plot with age on the x-axis. The plot will be a bit wrong on the righthand side of each panel, because the age groups are wider, but it’s close enough to be informative:
ggplot(gonorrhoea, aes(Age.Group, Rate, group = Year, color = Year)) +
  geom_line() +
  facet_wrap(~ Ethnicity + Gender)
The lines are close together, so there doesn't seem to be much of a time trend at all (though the time period is just five years). Since we have two variables to facet by, we can improve the plot slightly by using facet_grid instead of facet_wrap:
ggplot(gonorrhoea, aes(Age.Group, Rate, group = Year, color = Year)) +
  geom_line() +
  facet_grid(Ethnicity ~ Gender)
This clearly shows the effect of ethnicity on gonorrhoea infection rates: the curves are much higher in the “Non-Hispanic Blacks” and “American Indians & Alaskan Natives” groups. Since those groups dominate so much, it’s hard to see the effect of gender. By giving each row a different y-scale, it’s possible to neutralize the effect of ethnicity and see the difference between males and females more clearly:
ggplot(gonorrhoea, aes(Age.Group, Rate, group = Year, color = Year)) +
  geom_line() +
  facet_grid(Ethnicity ~ Gender, scales = "free_y")
Here you can see that women have higher infection rates than men (with the same age group and ethnicity) and have a more sustained peak, from 15 to 24 rather than 20 to 24.
Both the number of typos in this case, x, and the average number of typos, lambda, are 3:
dpois(3, 3)
## [1] 0.224
The quantile, q, is 12 (months), the size is 1, since we only need to get pregnant once, and the probability each month is 0.25:
pnbinom(12, 1, 0.25)
## [1] 0.9762
Straightforwardly, we just need to set the probability to 0.9:
qbirthday(0.9)
## [1] 41
We can either take a subset of the dataset (using the usual indexing method or the subset function), or pass subsetting details to the subset argument of lm:
model1 <- lm(
  Rate ~ Year + Age.Group + Ethnicity + Gender,
  gonorrhoea,
  subset = Age.Group %in% c("15 to 19", "20 to 24", "25 to 29", "30 to 34")
)
summary(model1)
##
## Call:
## lm(formula = Rate ~ Year + Age.Group + Ethnicity + Gender, data = gonorrhoea,
##     subset = Age.Group %in% c("15 to 19", "20 to 24", "25 to 29",
##         "30 to 34"))
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -774.0 -127.7  -10.3  106.2  857.7
##
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                          9491.13   25191.00    0.38   0.7068
## Year                                   -4.55      12.54   -0.36   0.7173
## Age.Group20 to 24                     131.12      50.16    2.61   0.0097
## Age.Group25 to 29                    -124.60      50.16   -2.48   0.0138
## Age.Group30 to 34                    -259.83      50.16   -5.18  5.6e-07
## EthnicityAsians & Pacific Islanders  -212.76      56.08   -3.79   0.0002
## EthnicityHispanics                   -124.06      56.08   -2.21   0.0281
## EthnicityNon-Hispanic Blacks         1014.35      56.08   18.09  < 2e-16
## EthnicityNon-Hispanic Whites         -174.72      56.08   -3.12   0.0021
## GenderMale                            -83.85      35.47   -2.36   0.0191
##
## (Intercept)
## Year
## Age.Group20 to 24                   **
## Age.Group25 to 29                   *
## Age.Group30 to 34                   ***
## EthnicityAsians & Pacific Islanders ***
## EthnicityHispanics                  *
## EthnicityNon-Hispanic Blacks        ***
## EthnicityNon-Hispanic Whites        **
## GenderMale                          *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 251 on 190 degrees of freedom
## Multiple R-squared:  0.798,  Adjusted R-squared:  0.789
## F-statistic: 83.7 on 9 and 190 DF,  p-value: <2e-16
Year isn't significant, so we remove it:
model2 <- update(model1, ~ . - Year)
summary(model2)
##
## Call:
## lm(formula = Rate ~ Age.Group + Ethnicity + Gender, data = gonorrhoea,
##     subset = Age.Group %in% c("15 to 19", "20 to 24", "25 to 29",
##         "30 to 34"))
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -774.0 -129.3   -6.7  104.3  866.8
##
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            358.2       53.1    6.75  1.7e-10
## Age.Group20 to 24                      131.1       50.0    2.62  0.00949
## Age.Group25 to 29                     -124.6       50.0   -2.49  0.01363
## Age.Group30 to 34                     -259.8       50.0   -5.19  5.3e-07
## EthnicityAsians & Pacific Islanders   -212.8       55.9   -3.80  0.00019
## EthnicityHispanics                    -124.1       55.9   -2.22  0.02777
## EthnicityNon-Hispanic Blacks          1014.3       55.9   18.13  < 2e-16
## EthnicityNon-Hispanic Whites          -174.7       55.9   -3.12  0.00207
## GenderMale                             -83.8       35.4   -2.37  0.01881
##
## (Intercept)                         ***
## Age.Group20 to 24                   **
## Age.Group25 to 29                   *
## Age.Group30 to 34                   ***
## EthnicityAsians & Pacific Islanders ***
## EthnicityHispanics                  *
## EthnicityNon-Hispanic Blacks        ***
## EthnicityNon-Hispanic Whites        **
## GenderMale                          *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 250 on 191 degrees of freedom
## Multiple R-squared:  0.798,  Adjusted R-squared:  0.79
## F-statistic: 94.5 on 8 and 191 DF,  p-value: <2e-16
This time Gender is significant, so we can stop. (Infection rates are similar across genders in children and the elderly, where sex isn't the main method of transmission, but in young adults, women have higher infection rates than men.)
Most of the possible interaction terms aren’t very interesting. We get a much better fit by adding an ethnicity-gender interaction, although not all interaction terms are significant (output is verbose, and so not shown). If you’re still having fun analyzing, try creating a separate ethnicity indicator for black/non-black and use that as the interaction term with gender:
##
## Call:
## lm(formula = Rate ~ Age.Group + Ethnicity + Gender + Ethnicity:Gender,
##     data = gonorrhoea, subset = Age.Group %in% c("15 to 19",
##         "20 to 24", "25 to 29", "30 to 34"))
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -806.0 -119.4   -9.4  105.3  834.8
##
## Coefficients:
##                                                Estimate Std. Error t value
## (Intercept)                                       405.1       63.9    6.34
## Age.Group20 to 24                                 131.1       50.1    2.62
## Age.Group25 to 29                                -124.6       50.1   -2.49
## Age.Group30 to 34                                -259.8       50.1   -5.19
## EthnicityAsians & Pacific Islanders              -296.9       79.2   -3.75
## EthnicityHispanics                               -197.2       79.2   -2.49
## EthnicityNon-Hispanic Blacks                      999.5       79.2   12.62
## EthnicityNon-Hispanic Whites                     -237.0       79.2   -2.99
## GenderMale                                       -177.6       79.2   -2.24
## EthnicityAsians & Pacific Islanders:GenderMale    168.2      112.0    1.50
## EthnicityHispanics:GenderMale                     146.2      112.0    1.30
## EthnicityNon-Hispanic Blacks:GenderMale            29.7      112.0    0.27
## EthnicityNon-Hispanic Whites:GenderMale           124.6      112.0    1.11
##                                                Pr(>|t|)
## (Intercept)                                     1.7e-09 ***
## Age.Group20 to 24                               0.00960 **
## Age.Group25 to 29                               0.01377 *
## Age.Group30 to 34                               5.6e-07 ***
## EthnicityAsians & Pacific Islanders             0.00024 ***
## EthnicityHispanics                              0.01370 *
## EthnicityNon-Hispanic Blacks                    < 2e-16 ***
## EthnicityNon-Hispanic Whites                    0.00315 **
## GenderMale                                      0.02615 *
## EthnicityAsians & Pacific Islanders:GenderMale  0.13487
## EthnicityHispanics:GenderMale                   0.19361
## EthnicityNon-Hispanic Blacks:GenderMale         0.79092
## EthnicityNon-Hispanic Whites:GenderMale         0.26754
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 251 on 187 degrees of freedom
## Multiple R-squared:  0.802,  Adjusted R-squared:  0.789
## F-statistic: 63.2 on 12 and 187 DF,  p-value: <2e-16
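One way to read that black/non-black suggestion, as a hedged sketch (the Black indicator and model4 are names introduced here, not from the text):
gonorrhoea$Black <- gonorrhoea$Ethnicity == "Non-Hispanic Blacks"
model4 <- lm(
  Rate ~ Age.Group + Black * Gender,
  gonorrhoea,
  subset = Age.Group %in% c("15 to 19", "20 to 24", "25 to 29", "30 to 34")
)
summary(model4)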
First, the setup. Get the package installed in the usual way:
install.packages("betareg")
We need to rescale the response variable to be between 0 and 1 (rather than 0 and 100), because that is the range of the beta distribution:
ovm <- within(obama_vs_mccain, Obama <- Obama / 100)
Now we're ready to run the model. It's just the same as running a linear regression, but we call betareg instead of lm. I'm arbitrarily looking at Black as the ethnicity group and Protestant as the religion:
library(betareg)
beta_model1 <- betareg(
  Obama ~ Turnout + Population + Unemployment + Urbanization + Black + Protestant,
  ovm,
  subset = State != "District of Columbia"
)
summary(beta_model1)
##
## Call:
## betareg(formula = Obama ~ Turnout + Population + Unemployment +
##     Urbanization + Black + Protestant, data = ovm, subset = State !=
##     "District of Columbia")
##
## Standardized weighted residuals 2:
##    Min     1Q Median     3Q    Max
## -2.834 -0.457 -0.062  0.771  2.044
##
## Coefficients (mean model with logit link):
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -5.52e-01   4.99e-01   -1.11   0.2689
## Turnout       1.69e-02   5.91e-03    2.86   0.0042 **
## Population    1.66e-09   5.34e-09    0.31   0.7566
## Unemployment  5.74e-02   2.31e-02    2.48   0.0132 *
## Urbanization -1.82e-05   1.54e-04   -0.12   0.9059
## Black         8.18e-03   4.27e-03    1.91   0.0558 .
## Protestant   -1.78e-02   3.17e-03   -5.62  1.9e-08 ***
##
## Phi coefficients (precision model with identity link):
##       Estimate Std. Error z value Pr(>|z|)
## (phi)    109.5       23.2    4.71  2.5e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 72.1 on 8 Df
## Pseudo R-squared: 0.723
## Number of iterations: 36 (BFGS) + 3 (Fisher scoring)
Urbanization is nonsignificant, so we remove it, using the same technique as for lm:
beta_model2 <- update(beta_model1, ~ . - Urbanization)
summary(beta_model2)
##
## Call:
## betareg(formula = Obama ~ Turnout + Population + Unemployment +
##     Black + Protestant, data = ovm, subset = State != "District of Columbia")
##
## Standardized weighted residuals 2:
##    Min     1Q Median     3Q    Max
## -2.831 -0.457 -0.053  0.774  2.007
##
## Coefficients (mean model with logit link):
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -5.69e-01   4.77e-01   -1.19   0.2327
## Turnout       1.70e-02   5.86e-03    2.90   0.0037 **
## Population    1.73e-09   5.31e-09    0.32   0.7452
## Unemployment  5.70e-02   2.29e-02    2.48   0.0130 *
## Black         7.93e-03   3.73e-03    2.13   0.0334 *
## Protestant   -1.76e-02   2.48e-03   -7.09  1.3e-12 ***
##
## Phi coefficients (precision model with identity link):
##       Estimate Std. Error z value Pr(>|z|)
## (phi)    109.4       23.2    4.71  2.5e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 72.1 on 7 Df
## Pseudo R-squared: 0.723
## Number of iterations: 31 (BFGS) + 3 (Fisher scoring)
Similarly, Population has to go:
beta_model3 <- update(beta_model2, ~ . - Population)
summary(beta_model3)
##
## Call:
## betareg(formula = Obama ~ Turnout + Unemployment + Black + Protestant,
##     data = ovm, subset = State != "District of Columbia")
##
## Standardized weighted residuals 2:
##    Min     1Q Median     3Q    Max
## -2.828 -0.458  0.043  0.742  1.935
##
## Coefficients (mean model with logit link):
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -0.55577    0.47567   -1.17   0.2427
## Turnout       0.01686    0.00585    2.88   0.0040 **
## Unemployment  0.05964    0.02145    2.78   0.0054 **
## Black         0.00820    0.00364    2.26   0.0241 *
## Protestant   -0.01779    0.00240   -7.42  1.2e-13 ***
##
## Phi coefficients (precision model with identity link):
##       Estimate Std. Error z value Pr(>|z|)
## (phi)    109.2       23.2    4.71  2.5e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 72.1 on 6 Df
## Pseudo R-squared: 0.723
## Number of iterations: 27 (BFGS) + 2 (Fisher scoring)
Plotting can be done with exactly the same code as with lm:
plot_numbers <- 1:6
layout(matrix(plot_numbers, ncol = 2, byrow = TRUE))
plot(beta_model3, plot_numbers)
There is no prescription for the exact rules that must be followed to check and handle user input. In general, if input is in the wrong form, it is best to try to convert it to the right one, warning the user in the process. Thus, for nonnumeric inputs, we can try to convert them to be numeric with a warning.
For nonpositive values, we can try substituting an in-range value or pretend the value was missing, but this would make the answer wrong. In this case, it is best to throw an error:
harmonic_mean <- function(x, na.rm = FALSE)
{
  if(!is.numeric(x))
  {
    warning("Coercing 'x' to be numeric.")
    x <- as.numeric(x)
  }
  if(any(!is.na(x) & x <= 0))
  {
    stop("'x' contains non-positive values")
  }
  1 / mean(1 / x, na.rm = na.rm)
}
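A quick sketch of how this version behaves (the expected results follow from the definition and the checks above; they aren't shown in the text):
harmonic_mean(c(1, 2, 4))       # 12/7, roughly 1.714
harmonic_mean(c(1, 2, 4, NA))   # NA, unless na.rm = TRUE is passed
harmonic_mean("1")              # coerces to numeric, with a warning
harmonic_mean(0)                # throws an error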
Here's the RUnit version. To test missing values, we need to consider the cases with na.rm being TRUE and na.rm being FALSE. To test that a warning has been thrown, we use the trick of changing the warn option to 2, meaning that warnings are treated as errors. For testing nonpositive numbers, we use the boundary case of zero:
test.harmonic_mean.1_2_4.returns_12_over_7 <- function()
{
  expected <- 12 / 7
  actual <- harmonic_mean(c(1, 2, 4))
  checkEqualsNumeric(expected, actual)
}
test.harmonic_mean.no_inputs.throws_error <- function()
{
  checkException(harmonic_mean())
}
test.harmonic_mean.some_missing.returns_na <- function()
{
  expected <- NA_real_
  actual <- harmonic_mean(c(1, 2, 4, NA))
  checkEqualsNumeric(expected, actual)
}
test.harmonic_mean.some_missing_with_nas_removed.returns_12_over_7 <- function()
{
  expected <- 12 / 7
  actual <- harmonic_mean(c(1, 2, 4, NA), na.rm = TRUE)
  checkEqualsNumeric(expected, actual)
}
test.harmonic_mean.non_numeric_input.throws_warning <- function()
{
  old_ops <- options(warn = 2)
  on.exit(options(old_ops))
  checkException(harmonic_mean("1"))
}
test.harmonic_mean.zero_inputs.throws_error <- function()
{
  checkException(harmonic_mean(0))
}
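To actually run these tests you can use RUnit's suite machinery; a hedged sketch (the suite name, directory, and file pattern are examples, so point them at wherever you saved the test functions):
library(RUnit)
suite <- defineTestSuite(
  "harmonic_mean_tests",
  dirs = ".",                       # example: current directory
  testFileRegexp = "^test.+\\.R$"   # example: files named test*.R
)
runTestSuite(suite)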
The testthat translation is straightforward. Warnings are easier to test for, using the expect_warning function:
expect_equal(12 / 7, harmonic_mean(c(1, 2, 4)))
expect_error(harmonic_mean())
expect_equal(NA_real_, harmonic_mean(c(1, 2, 4, NA)))
expect_equal(12 / 7, harmonic_mean(c(1, 2, 4, NA), na.rm = TRUE))
expect_warning(harmonic_mean("1"))
expect_error(harmonic_mean(0))
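In a real test file you would normally wrap these expectations in test_that blocks so that failures are reported with a description; a minimal sketch:
test_that("harmonic_mean works on simple input", {
  expect_equal(12 / 7, harmonic_mean(c(1, 2, 4)))
})
test_that("harmonic_mean rejects non-positive values", {
  expect_error(harmonic_mean(0))
})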
Here's the updated harmonic_mean function:
harmonic_mean <- function(x, na.rm = FALSE)
{
  if(!is.numeric(x))
  {
    warning("Coercing 'x' to be numeric.")
    x <- as.numeric(x)
  }
  if(any(!is.na(x) & x <= 0))
  {
    stop("'x' contains non-positive values")
  }
  result <- 1 / mean(1 / x, na.rm = na.rm)
  class(result) <- "harmonic"
  result
}
To make an S3 method, the name must be of the form function.class; in this case, print.harmonic. The contents can be generated with other print functions, but here we use the lower-level cat function:
print.harmonic <- function(x, ...)
{
  cat("The harmonic mean is", x, "\n", ...)
}
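With both pieces defined, results of harmonic_mean now print via the new method; a one-line sketch (the printed wording comes from the cat call above):
harmonic_mean(c(1, 2, 4))   # prints: The harmonic mean is 1.714286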
Creating the function and the data frame is straightforward enough:
sum_of_squares <- function(n)
{
  n * (n + 1) * (2 * n + 1) / 6
}
x <- 1:10
squares_data <- data.frame(x = x, y = sum_of_squares(x))
In your call to package.skeleton, you need to think about which directory you want to create the package in:
package.skeleton("squares", c("sum_of_squares", "squares_data"))
The function documentation goes in R/sum_of_squares.R, or similar:
#' Sum of Squares
#'
#' Calculates the sum of squares of the first \code{n} natural numbers.
#'
#' @param n A positive integer.
#' @return An integer equal to the sum of the squares of the first \code{n}
#' natural numbers.
#' @author Richie Cotton.
#' @seealso \code{\link[base]{sum}}
#' @examples
#' sum_of_squares(1:20)
#' @keywords misc
#' @export
The package documentation goes in R/squares-package.R:
#' squares: Sums of squares.
#'
#' A test package that contains data on the sum of squares of natural
#' numbers, and a function to calculate more values.
#'
#' @author Richie Cotton
#' @docType package
#' @name squares
#' @aliases squares squares-package
#' @keywords package
NULL
The data documentation goes in the same file, or in R/squares-data.R:
#' Sum of squares dataset
#'
#' The sum of squares of natural numbers.
#' \itemize{
#' \item{x}{Natural numbers.}
#' \item{y}{The sum of squares from 1 to \code{x}.}
#' }
#'
#' @docType data
#' @keywords datasets
#' @name squares_data
#' @usage data(squares_data)
#' @format A data frame with 10 rows and 2 variables.
NULL
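These roxygen comments still need to be turned into Rd help files before checking and building; a hedged sketch using the devtools/roxygen2 workflow (which also provides the check and build calls below):
library(devtools)
document("squares")   # runs roxygen2, generating man/*.Rd and the NAMESPACE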
check("squares")
build("squares")