3.2 Data types
3.2.1 Atomic
In most cases, each atomic element has a type (mode) of

- numeric: a number
- logical: TRUE or FALSE (T or F for shortcuts)
- character: a character string
- factor: a level of a categorical variable

Other types include date and the nonexistent NULL. The factor is also a class of its own, meaning that many R functions apply operations that are specific to the factor class.
# assess objects 123, "abc", and TRUE for their types
str(123) # str() returns the structure
## num 123
str("abc")
## chr "abc"
str(TRUE)
## logi TRUE
c(is.numeric(123), is.numeric("abc"), is.numeric(TRUE))
## [1] TRUE FALSE FALSE
c(is.logical(123), is.logical("abc"), is.logical(TRUE))
## [1] FALSE FALSE TRUE
c(is.character(123), is.character("abc"), is.character(TRUE))
## [1] FALSE TRUE FALSE
# "<-" means an assignment from right to left
factor1 <- as.factor(c(1,2,3)) # Looks like numeric but not
factor1
## [1] 1 2 3
## Levels: 1 2 3
factor2 <- as.factor(c("a","b","c")) # Looks like characters but not
factor2
## [1] a b c
## Levels: a b c
factor3 <- as.factor(c(TRUE,FALSE,T)) # Looks like logicals but not
factor3
## [1] TRUE FALSE TRUE
## Levels: FALSE TRUE
c(is.factor(factor1[1]), is.factor(factor2[1]), is.factor(factor3[1]))
## [1] TRUE TRUE TRUE
# Extract the first element (factor1[1] etc.)
factor1[1]
## [1] 1
## Levels: 1 2 3
factor2[2]
## [1] b
## Levels: a b c
factor3[3]
## [1] TRUE
## Levels: FALSE TRUE
NULL has zero length. Empty numeric, logical, and character objects also have zero length.
length(NULL)
## [1] 0
length(numeric(0)) # numeric(N) returns a vector of N zeros
## [1] 0
length(logical(0)) # logical(N) returns a vector of N FALSE objects
## [1] 0
length(character(0)) # character(N) returns a vector of N "" objects
## [1] 0
Each vector has a type of numeric, logical, character, or factor. Each matrix has a type of numeric, logical, or character. A data frame can contain mixed types across columns, where each column (e.g., a variable) has a type of numeric, logical, character, or factor.
vector1 <- c(1, NA, 2, 3) # read as numeric
vector1
## [1] 1 NA 2 3
vector2 <- c(TRUE, FALSE, T, F) # read as logical
vector2
## [1] TRUE FALSE TRUE FALSE
vector3 <- c(1, NA, "abc", TRUE, "TRUE") # read as character
vector3
## [1] "1" NA "abc" "TRUE" "TRUE"
vector4 <- as.factor(c(1, NA, "abc", TRUE, "TRUE")) # read as factor
vector4
## [1] 1 <NA> abc TRUE TRUE
## Levels: 1 abc TRUE
matrix1 <- matrix(c(1:6), nrow = 3) # read as numeric
matrix1
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
matrix2 <- matrix(c(TRUE,FALSE,rep(T,3),F), nrow = 3) # read as logical
matrix2
## [,1] [,2]
## [1,] TRUE TRUE
## [2,] FALSE TRUE
## [3,] TRUE FALSE
matrix3 <- matrix(c(1,2,3,"a","b","abc"), nrow = 3) # read as character
matrix3
## [,1] [,2]
## [1,] "1" "a"
## [2,] "2" "b"
## [3,] "3" "abc"
# Build a data frame with one column per type.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 changed the
# default to FALSE; without it the string columns stay character and the
# factor output shown below is not reproduced.
df1 <- data.frame(
  num = c(1, 2, 3),           # read as numeric
  fac1 = c("a", "b", "abc"),  # character, converted to factor
  logi = c(TRUE, FALSE, T),   # read as logical (T is shorthand for TRUE)
  fac2 = c(1, "a", TRUE),     # mixed types become character, then factor
  stringsAsFactors = TRUE
)
df1
## num fac1 logi fac2
## 1 1 a TRUE 1
## 2 2 b FALSE a
## 3 3 abc TRUE TRUE
df1$num # "$" symbol is used to extract a column
## [1] 1 2 3
df1$fac1 # character type is converted into a factor
## [1] a b abc
## Levels: a abc b
df1$logi
## [1] TRUE FALSE TRUE
df1$fac2 # mixed types within a column is converted into a factor
## [1] 1 a TRUE
## Levels: 1 a TRUE
# additional argument "stringsAsFactors = FALSE" preserves character types.
df2 <- data.frame(
num = c(1,2,3), # read as numeric
char = c("a","b","abc"), # read as character
logi = c(TRUE, FALSE, T), # read as logical
fac2 = as.factor(c(1,"a",TRUE)), # read as factor
stringsAsFactors = FALSE
)
df2
## num char logi fac2
## 1 1 a TRUE 1
## 2 2 b FALSE a
## 3 3 abc TRUE TRUE
df2$num
## [1] 1 2 3
df2$char
## [1] "a" "b" "abc"
df2$logi
## [1] TRUE FALSE TRUE
df2$fac2
## [1] 1 a TRUE
## Levels: 1 a TRUE
3.2.2 Factor
A factor object is defined with a set of categorical levels, which may be labeled. The levels are either ordered (defined by ordered()) or unordered (defined by factor()). Certain statistical and graphical procedures treat ordered factor objects in the specified order.
# We will convert the columns of df into factors
df <- data.frame(
fac1 = c(0,1,1,4,4,2,2,3),
fac2 = c(1,2,3,1,1,2,2,3),
fac3 = c(4,2,3,4,4,2,2,3)
)
# convert fac1 to ordered factors
df$fac1 <- ordered(df$fac1,
levels = c(0,4,3,2,1) # defines the order
)
df$fac1
## [1] 0 1 1 4 4 2 2 3
## Levels: 0 < 4 < 3 < 2 < 1
summary(df$fac1) # gives the table of counts for each level
## 0 4 3 2 1
## 1 2 1 2 2
# convert fac2 to unordered factors with labels
df$fac2 <- factor(df$fac2,
levels = c(1,2,3), # no particular order
# attach labels to factors: 1=red, 2=blue, 3=green
labels = c("red", "blue", "green")
)
df$fac2
## [1] red blue green red red blue blue green
## Levels: red blue green
summary(df$fac2)
## red blue green
## 3 3 2
# convert fac3 to ordered factors with labels
df$fac3 <- ordered(df$fac3,
levels = c(2,3,4),
# attach labels to factors: 2=Low, 3=Medium, 4=High
labels = c("Low", "Medium", "High")
)
df$fac3
## [1] High Low Medium High High Low Low Medium
## Levels: Low < Medium < High
summary(df$fac3)
## Low Medium High
## 3 2 3
3.2.3 Matrix
matrix() defines a matrix from a vector. The default is to arrange the vector by column (byrow = FALSE).
# byrow = FALSE (the default)
matrix(data = c(1:6), nrow = 2, ncol = 3, byrow = FALSE, dimnames = NULL)
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
# byrow = TRUE
matrix(data = c(1:6), nrow = 2, ncol = 3, byrow = TRUE, dimnames = NULL)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
# give row and column names to a matrix
mat1 <- matrix(data = c(1:6), nrow = 2, ncol = 3, byrow = FALSE,
dimnames = list(c("r1","r2"), c("c1","c2","c3")))
mat1
## c1 c2 c3
## r1 1 3 5
## r2 2 4 6
dim(mat1) # dimension: row by column
## [1] 2 3
colnames(mat1)
## [1] "c1" "c2" "c3"
rownames(mat1)
## [1] "r1" "r2"
colnames(mat1) <- c("v1","v2","v3") # change column names by assignment "<-"
mat1
## v1 v2 v3
## r1 1 3 5
## r2 2 4 6
# R makes a guess when only nrow or ncol is supplied
matrix(data = c(1:6), nrow = 2)
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
matrix(data = c(1:6), ncol = 3)
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
# combine matrices by column via "cbind()" or by row via "rbind()"
cbind(mat1,mat1)
## v1 v2 v3 v1 v2 v3
## r1 1 3 5 1 3 5
## r2 2 4 6 2 4 6
rbind(mat1,mat1)
## v1 v2 v3
## r1 1 3 5
## r2 2 4 6
## r1 1 3 5
## r2 2 4 6
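R applies recycling rules: when a vector is shorter than the length required by an operation, its elements are reused from the beginning until the required length is filled.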
# the vector shorter than the length of all elements of a matrix
matrix(data = c(1:4), nrow = 2, ncol= 3)
## Warning in matrix(data = c(1:4), nrow = 2, ncol = 3): data length [4] is
## not a sub-multiple or multiple of the number of columns [3]
## [,1] [,2] [,3]
## [1,] 1 3 1
## [2,] 2 4 2
# R treats a scalar as a vector whose length conforms to cbind() or rbind()
cbind(mat1, colA = 1)
## v1 v2 v3 colA
## r1 1 3 5 1
## r2 2 4 6 1
rbind(mat1, rowA= 1, rowB= 2, rowC= 3)
## v1 v2 v3
## r1 1 3 5
## r2 2 4 6
## rowA 1 1 1
## rowB 2 2 2
## rowC 3 3 3
To replace elements of a matrix, we can use the assignment operator <-.
mat1[1,1] <- 10
mat1
## v1 v2 v3
## r1 10 3 5
## r2 2 4 6
mat1[,2] <- c(7,8)
mat1
## v1 v2 v3
## r1 10 7 5
## r2 2 8 6
mat1[,1] <- 0 # recycling rule
mat1
## v1 v2 v3
## r1 0 7 5
## r2 0 8 6
A matrix allows easy extraction of rows and columns, using row and column indices separated by a comma.
mat1
## v1 v2 v3
## r1 0 7 5
## r2 0 8 6
mat1[1, ] # row = 1 and all columns
## v1 v2 v3
## 0 7 5
mat1[, 1] # all rows and col = 1
## r1 r2
## 0 0
mat1[c(TRUE,FALSE),] # by a logical vector
## v1 v2 v3
## 0 7 5
mat1[, c(TRUE,FALSE)]
## v1 v3
## r1 0 5
## r2 0 6
mat1[2,3] # row = 2 and col = 3
## [1] 6
mat1[1:2, 2:3] # row = 1:2 and col = 2:3
## v2 v3
## r1 7 5
## r2 8 6
mat1[1:2, 2:3][2,2] # subset of a subset
## [1] 6
mat1[, 1][2] # vector extraction is done with one-dimensional index
## r2
## 0
Important: when a single row or column is extracted, it gets converted to a vector with no dimension.
mat1[1,]
## v1 v2 v3
## 0 7 5
is.matrix(mat1[1, ])
## [1] FALSE
dim(mat1[1,])
## NULL
length(mat1[1, ])
## [1] 3
# to keep a row or column vector structure, use drop = FALSE
mat1[1,, drop = FALSE]
## v1 v2 v3
## r1 0 7 5
is.matrix(mat1[1,,drop = FALSE])
## [1] TRUE
dim(mat1[1,,drop = FALSE])
## [1] 1 3
length(mat1[1,,drop = FALSE])
## [1] 3
mat1[,1, drop = FALSE]
## v1
## r1 0
## r2 0
is.matrix(mat1[,1,drop = FALSE])
## [1] TRUE
dim(mat1[,1,drop = FALSE])
## [1] 2 1
length(mat1[,1,drop = FALSE])
## [1] 2
Another way of extraction from a matrix is to use row or column names.
mat1[,'v1']
## r1 r2
## 0 0
mat1[,c('v1','v3')]
## v1 v3
## r1 0 5
## r2 0 6
mat1['r2',,drop= FALSE]
## v1 v2 v3
## r2 0 8 6
apply() applies a function over a specified margin (dimension index number) of the matrix.
mat1
## v1 v2 v3
## r1 0 7 5
## r2 0 8 6
apply(mat1,1,mean) # dimension 1 (across rows)
## r1 r2
## 4.000000 4.666667
apply(mat1,2,mean) # dimension 2 (across columns)
## v1 v2 v3
## 0.0 7.5 5.5
# one can write a custom function inside apply() (called an anonymous function).
# Its argument corresponds to the row or column vector passed by apply().
apply(mat1,2, function(x) sum(x)/length(x) ) # x is the internal vector name
## v1 v2 v3
## 0.0 7.5 5.5
# For each column of mat1, compute the mean and the standard deviation and
# return them as a named list, so ans1 holds one list per column.
ans1 <- apply(mat1, 2, function(x) {
  col_avg <- mean(x)
  col_sd <- sd(x)  # distinct local name so the sd() function is not shadowed
  # return the results as a list
  list(Avg = col_avg, Sd = col_sd)
})
unlist(ans1[[2]]) # results for the second column
## Avg Sd
## 7.5000000 0.7071068
unlist(ans1[[3]]) # results for the third column
## Avg Sd
## 5.5000000 0.7071068
Arrays are a generalization of matrices and can have more than 2 dimensions.
array(c(1:18), c(2,3,3)) # dimension 2 by 3 by 3
## , , 1
##
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
##
## , , 2
##
## [,1] [,2] [,3]
## [1,] 7 9 11
## [2,] 8 10 12
##
## , , 3
##
## [,1] [,2] [,3]
## [1,] 13 15 17
## [2,] 14 16 18
array(c(1:9), c(2,3,3)) # R recycles the vector
## , , 1
##
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
##
## , , 2
##
## [,1] [,2] [,3]
## [1,] 7 9 2
## [2,] 8 1 3
##
## , , 3
##
## [,1] [,2] [,3]
## [1,] 4 6 8
## [2,] 5 7 9
3.2.4 Data Frame
A data frame is similar to a matrix, but it accepts multiple types (modes) of variables across columns (e.g., a dataset in typical data analysis programs like SAS, SPSS, Stata etc.). In some cases matrices and data frames may be treated interchangeably, but generally they need to be distinguished. Data manipulation functions are often written for data frames, while some base R functions are written for matrices.
mymat1 <- matrix(data = c(1:6), nrow = 2, ncol = 3,
dimnames = list(c("r1","r2"), c("c1","c2","c3")))
mymat1
## c1 c2 c3
## r1 1 3 5
## r2 2 4 6
class(mymat1)
## [1] "matrix"
colnames(mymat1)
## [1] "c1" "c2" "c3"
names(mymat1)
## NULL
# Extend the matrix mymat1 with columns of other types to form a data frame.
# stringsAsFactors = TRUE is explicit so that fac1 and fac2 really are
# factors on R >= 4.0.0, where the default became FALSE.
mydf1 <- data.frame(
  mymat1,
  num = c(1, 2),
  fac1 = c("a", "abc"),
  logi = c(TRUE, FALSE),
  fac2 = c(1, "a"),  # mixed types become character, then factor
  stringsAsFactors = TRUE
)
mydf1
## c1 c2 c3 num fac1 logi fac2
## r1 1 3 5 1 a TRUE 1
## r2 2 4 6 2 abc FALSE a
class(mydf1)
## [1] "data.frame"
colnames(mydf1)
## [1] "c1" "c2" "c3" "num" "fac1" "logi" "fac2"
names(mydf1) # colnames and names are the same
## [1] "c1" "c2" "c3" "num" "fac1" "logi" "fac2"
Extracting elements from a data frame is similar to extracting from a matrix, but there are a few additional methods.
mydf1[1,] # row = 1 and all columns
## c1 c2 c3 num fac1 logi fac2
## r1 1 3 5 1 a TRUE 1
mydf1[,1] # all rows and col = 1
## [1] 1 2
# data frame preserves dimension while extracting a row but not a column
dim(mydf1[1,])
## [1] 1 7
dim(mydf1[,1])
## NULL
dim(mydf1[,1,drop=FALSE]) # use drop = FALSE to keep a column vector
## [1] 2 1
mydf1[,1,drop=FALSE]
## c1
## r1 1
## r2 2
mydf1[, c('c1','num','logi')]
## c1 num logi
## r1 1 1 TRUE
## r2 2 2 FALSE
class(mydf1[, c('c1','num','logi')])
## [1] "data.frame"
# extraction by column name with "$" symbol: df$varname
mydf1$c1
## [1] 1 2
dim(mydf1$c1)
## NULL
# one can use quote ' ' or " " as well
mydf1$'c1'
## [1] 1 2
# similarly, extraction by column name with [[ ]]: df[['varname']]
mydf1[['c1']]
## [1] 1 2
dim(mydf1[['c1']])
## NULL
# or by index
mydf1[[1]]
## [1] 1 2
# [[ ]] method is useful when passing a variable name as a string
# Set one column of a data frame to NA and return the modified copy.
#
# df:  a data frame (or list).
# var: a single column name given as a string (or a single column index).
#
# `[[ ]]` is used because `var` holds the name; `df$var` would instead refer
# to a column literally called "var". The caller's object is left untouched;
# the changed copy is returned.
set_to_na <- function(df, var) {
  # A longer `var` would make `[[ ]]` index recursively instead of selecting
  # several columns, so insist on exactly one selector.
  stopifnot(length(var) == 1L)
  df[[var]] <- NA
  df
}
mydf1
## c1 c2 c3 num fac1 logi fac2
## r1 1 3 5 1 a TRUE 1
## r2 2 4 6 2 abc FALSE a
mydf2 <- set_to_na(mydf1, "c2")
mydf2
## c1 c2 c3 num fac1 logi fac2
## r1 1 NA 5 1 a TRUE 1
## r2 2 NA 6 2 abc FALSE a
# add a variable
mydf1$newvar <- c(4, 4)
mydf1
## c1 c2 c3 num fac1 logi fac2 newvar
## r1 1 3 5 1 a TRUE 1 4
## r2 2 4 6 2 abc FALSE a 4
mydf1$newvar2 <- mydf1$c2 + mydf1$c3
mydf1
## c1 c2 c3 num fac1 logi fac2 newvar newvar2
## r1 1 3 5 1 a TRUE 1 4 8
## r2 2 4 6 2 abc FALSE a 4 10
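apply() may not work well with data frames since data frames are not exactly matrices: apply() first coerces the data frame to a matrix, so columns of mixed types are converted to a common type. We can use simplified apply sapply() or list apply lapply() instead.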
mydf1
## c1 c2 c3 num fac1 logi fac2 newvar newvar2
## r1 1 3 5 1 a TRUE 1 4 8
## r2 2 4 6 2 abc FALSE a 4 10
# sapply()
idx_num <- sapply(mydf1, is.numeric)
idx_num
## c1 c2 c3 num fac1 logi fac2 newvar newvar2
## TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE
apply(mydf1[,idx_num], 2, mean)
## c1 c2 c3 num newvar newvar2
## 1.5 3.5 5.5 1.5 4.0 9.0
sapply(mydf1[,idx_num], mean)
## c1 c2 c3 num newvar newvar2
## 1.5 3.5 5.5 1.5 4.0 9.0
# lapply()
idx_num2 <- unlist(lapply(mydf1, is.numeric))
idx_num2
## c1 c2 c3 num fac1 logi fac2 newvar newvar2
## TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE
unlist(lapply(mydf1[,idx_num2], mean))
## c1 c2 c3 num newvar newvar2
## 1.5 3.5 5.5 1.5 4.0 9.0
3.2.5 List
A list is an ordered collection of (possibly unrelated) objects. The objects in a list are referenced by [[1]], [[2]], …, or [[‘var1’]], [[‘var2’]], … etc.
# A list whose elements have unrelated types: numeric vector, character
# vector, factor, and data frame.
# stringsAsFactors = TRUE is explicit so that v4$u2 is a factor on
# R >= 4.0.0 as well, matching the "Levels: p q r" output shown below.
mylist1 <- list(
  v1 = c(1, 2, 3),
  v2 = c("a", "b"),
  v3 = factor(c("blue", "red", "orange", "yellow")),
  v4 = data.frame(
    u1 = c(1:3),
    u2 = c("p", "q", "r"),
    stringsAsFactors = TRUE
  )
)
mylist1
## $v1
## [1] 1 2 3
##
## $v2
## [1] "a" "b"
##
## $v3
## [1] blue red orange yellow
## Levels: blue orange red yellow
##
## $v4
## u1 u2
## 1 1 p
## 2 2 q
## 3 3 r
# extraction
mylist1[[1]]
## [1] 1 2 3
mylist1[["v2"]]
## [1] "a" "b"
mylist1$v3
## [1] blue red orange yellow
## Levels: blue orange red yellow
mylist1$v4$u2
## [1] p q r
## Levels: p q r
# assignment
mylist1$v5 <- c("a",NA)
mylist1$v5
## [1] "a" NA
# a list can be nested
mylist1$v6 <- list(y1 = c(2,9), y2 = c(0,0,0,1))
mylist1$v6
## $y1
## [1] 2 9
##
## $y2
## [1] 0 0 0 1
lapply()
is very versatile since the items in a list can be completely unrelated.
unlist(lapply(mylist1, class))
## v1 v2 v3 v4 v5
## "numeric" "character" "factor" "data.frame" "character"
## v6
## "list"
unlist(lapply(mylist1, attributes)) # some variables have attributes
## v3.levels1 v3.levels2 v3.levels3 v3.levels4 v3.class
## "blue" "orange" "red" "yellow" "factor"
## v4.names1 v4.names2 v4.row.names1 v4.row.names2 v4.row.names3
## "u1" "u2" "1" "2" "3"
## v4.class v6.names1 v6.names2
## "data.frame" "y1" "y2"
# Summarise every element of mylist1 according to its kind.
# The data-frame test comes before the generic list test because a data
# frame is also a list. Elements matching none of the tests yield NULL.
lapply(mylist1, function(x) {
  if (is.numeric(x)) {
    summary(x)
  } else if (is.character(x)) {
    x
  } else if (is.factor(x)) {
    table(x)
  } else if (is.data.frame(x)) {
    head(x)
  } else if (is.list(x)) {
    unlist(lapply(x, class))
  }
})
## $v1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 1.5 2.0 2.0 2.5 3.0
##
## $v2
## [1] "a" "b"
##
## $v3
## x
## blue orange red yellow
## 1 1 1 1
##
## $v4
## u1 u2
## 1 1 p
## 2 2 q
## 3 3 r
##
## $v5
## [1] "a" NA
##
## $v6
## y1 y2
## "numeric" "numeric"