3.2 Data types

3.2.1 Atomic

In most cases, each atomic element has a type (mode) of

  • numeric: number

  • logical: TRUE or FALSE (T or F for shortcuts)

  • character: character string

  • factor: a level of categorical variable

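Other types include dates and NULL, which represents a nonexistent (empty) object. Strictly speaking, a factor is not a basic mode but a class of its own (integer codes with attached level labels), meaning that many R functions apply operations that are specific to the factor class.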

# assess objects 123, "abc", and TRUE for their types 
str(123)  # str() returns the structure
##  num 123
str("abc")
##  chr "abc"
str(TRUE)
##  logi TRUE
c(is.numeric(123), is.numeric("abc"), is.numeric(TRUE))
## [1]  TRUE FALSE FALSE
c(is.logical(123), is.logical("abc"), is.logical(TRUE))
## [1] FALSE FALSE  TRUE
c(is.character(123), is.character("abc"), is.character(TRUE))
## [1] FALSE  TRUE FALSE
# "<-" assigns the value on its right to the name on its left
factor1 <- as.factor(c(1,2,3)) # prints like numbers, but is a factor
factor1
## [1] 1 2 3
## Levels: 1 2 3
factor2 <- as.factor(c("a","b","c"))  # prints like characters, but is a factor
factor2
## [1] a b c
## Levels: a b c
factor3 <- as.factor(c(TRUE,FALSE,T)) # prints like logicals, but is a factor
factor3
## [1] TRUE  FALSE TRUE 
## Levels: FALSE TRUE
c(is.factor(factor1[1]), is.factor(factor2[1]), is.factor(factor3[1]))
## [1] TRUE TRUE TRUE
# Extract single elements (factor1[1], factor2[2], factor3[3]); each keeps all levels
factor1[1]
## [1] 1
## Levels: 1 2 3
factor2[2]
## [1] b
## Levels: a b c
factor3[3]
## [1] TRUE
## Levels: FALSE TRUE

NULL has zero length. Empty numeric, logical, and character vectors also have zero length.

length(NULL) 
## [1] 0
length(numeric(0))  # numeric(N) returns a vector of N zeros 
## [1] 0
length(logical(0))  # logical(N) returns a vector of N FALSE objects
## [1] 0
length(character(0)) # character(N) returns a vector of N "" objects
## [1] 0

Each vector has a type of numeric, logical, character, or factor. Each matrix has a type of numeric, logical, or character. A data frame can contain mixed types across columns where each column (e.g., a variable) has a type of numeric, logical, character or factor.

vector1 <- c(1, NA, 2, 3) # read as numeric
vector1
## [1]  1 NA  2  3
vector2 <- c(TRUE, FALSE, T, F) # read as logical
vector2
## [1]  TRUE FALSE  TRUE FALSE
vector3 <- c(1, NA, "abc", TRUE, "TRUE") # read as character
vector3
## [1] "1"    NA     "abc"  "TRUE" "TRUE"
vector4 <- as.factor(c(1, NA, "abc", TRUE, "TRUE")) # read as factor 
vector4
## [1] 1    <NA> abc  TRUE TRUE
## Levels: 1 abc TRUE
matrix1 <- matrix(c(1:6), nrow = 3) # read as numeric
matrix1
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
matrix2 <- matrix(c(TRUE,FALSE,rep(T,3),F), nrow = 3)  # read as logical
matrix2
##       [,1]  [,2]
## [1,]  TRUE  TRUE
## [2,] FALSE  TRUE
## [3,]  TRUE FALSE
matrix3 <- matrix(c(1,2,3,"a","b","abc"), nrow = 3) # read as character
matrix3
##      [,1] [,2] 
## [1,] "1"  "a"  
## [2,] "2"  "b"  
## [3,] "3"  "abc"
# Build a data frame whose character columns are stored as factors.
# stringsAsFactors = TRUE is spelled out because the data.frame() default
# changed to FALSE in R 4.0.0; without it fac1 and fac2 stay character.
df1 <- data.frame(
        num  = c(1,2,3),              # read as numeric
        fac1 = c("a","b","abc"),      # character input, converted to factor
        logi = c(TRUE, FALSE, TRUE),  # read as logical
        fac2 = c(1,"a",TRUE),         # mixed types become character, then factor
        stringsAsFactors = TRUE
       )
df1
##   num fac1  logi fac2
## 1   1    a  TRUE    1
## 2   2    b FALSE    a
## 3   3  abc  TRUE TRUE
df1$num   # "$" symbol is used to extract a column 
## [1] 1 2 3
df1$fac1  # character type is converted into a factor 
## [1] a   b   abc
## Levels: a abc b
df1$logi
## [1]  TRUE FALSE  TRUE
df1$fac2  # mixed types within a column is converted into a factor
## [1] 1    a    TRUE
## Levels: 1 a TRUE
# the argument "stringsAsFactors = FALSE" preserves character types
# (it has been the default for data.frame() since R 4.0.0).
df2 <- data.frame(
        num  = c(1,2,3),           # read as numeric
        char = c("a","b","abc"),   # read as character
        logi = c(TRUE, FALSE, T),  # read as logical
        fac2  = as.factor(c(1,"a",TRUE)),      # read as factor
        stringsAsFactors = FALSE
       )  
df2
##   num char  logi fac2
## 1   1    a  TRUE    1
## 2   2    b FALSE    a
## 3   3  abc  TRUE TRUE
df2$num  
## [1] 1 2 3
df2$char
## [1] "a"   "b"   "abc"
df2$logi
## [1]  TRUE FALSE  TRUE
df2$fac2
## [1] 1    a    TRUE
## Levels: 1 a TRUE

3.2.2 Factor

A factor object is defined with a set of categorical levels, which may be labeled. The levels are either ordered (created with ordered()) or unordered (created with factor()). Certain statistical and graphical procedures treat the levels of an ordered factor in their specified order.

# We will convert the columns of df into factors   
df <- data.frame(
      fac1 = c(0,1,1,4,4,2,2,3),
      fac2 = c(1,2,3,1,1,2,2,3),
      fac3 = c(4,2,3,4,4,2,2,3)
      )

# convert fac1 to ordered factors
df$fac1 <- ordered(df$fac1,
                  levels = c(0,4,3,2,1) # defines the order
                  ) 
df$fac1
## [1] 0 1 1 4 4 2 2 3
## Levels: 0 < 4 < 3 < 2 < 1
summary(df$fac1)  # gives the table of counts for each level
## 0 4 3 2 1 
## 1 2 1 2 2
# convert fac2 to unordered factors with labels
df$fac2 <- factor(df$fac2,
                  levels = c(1,2,3), #  no particular order
                  # attach labels to factors: 1=red, 2=blue, 3=green
                  labels = c("red", "blue", "green")
                  ) 
df$fac2
## [1] red   blue  green red   red   blue  blue  green
## Levels: red blue green
summary(df$fac2)                
##   red  blue green 
##     3     3     2
# convert fac3 to ordered factors with labels
df$fac3 <- ordered(df$fac3,
                  levels = c(2,3,4),
                  # attach labels to factors: 2=Low, 3=Medium, 4=High
                  labels = c("Low", "Medium", "High")
                  ) 
df$fac3
## [1] High   Low    Medium High   High   Low    Low    Medium
## Levels: Low < Medium < High
summary(df$fac3)
##    Low Medium   High 
##      3      2      3

3.2.3 Matrix

matrix() defines a matrix from a vector. The default is to arrange the vector by column (byrow = FALSE).

# byrow = FALSE  (the default)
matrix(data = c(1:6), nrow = 2, ncol = 3, byrow = FALSE, dimnames = NULL)
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
# byrow = TRUE 
matrix(data = c(1:6), nrow = 2, ncol = 3, byrow = TRUE, dimnames = NULL)
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
# give row and column names to a matrix 
mat1 <- matrix(data = c(1:6), nrow = 2, ncol = 3, byrow = FALSE, 
       dimnames = list(c("r1","r2"), c("c1","c2","c3")))
mat1
##    c1 c2 c3
## r1  1  3  5
## r2  2  4  6
dim(mat1)  # dimension: row by column
## [1] 2 3
colnames(mat1)
## [1] "c1" "c2" "c3"
rownames(mat1)
## [1] "r1" "r2"
colnames(mat1) <- c("v1","v2","v3")  # change column names by assignment "<-" 
mat1
##    v1 v2 v3
## r1  1  3  5
## r2  2  4  6
# R makes a guess when only nrow or ncol is supplied
matrix(data = c(1:6), nrow = 2)
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
matrix(data = c(1:6), ncol = 3)
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
# combine matrices by column via "cbind()" or by row via "rbind()"
cbind(mat1,mat1)
##    v1 v2 v3 v1 v2 v3
## r1  1  3  5  1  3  5
## r2  2  4  6  2  4  6
rbind(mat1,mat1)
##    v1 v2 v3
## r1  1  3  5
## r2  2  4  6
## r1  1  3  5
## r2  2  4  6

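R applies recycling rules: when a vector is shorter than the object it has to fill or be combined with, its elements are reused from the beginning until the required length is reached.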

# a vector shorter than the number of matrix elements is recycled (with a warning here)
matrix(data = c(1:4), nrow = 2, ncol= 3)
## Warning in matrix(data = c(1:4), nrow = 2, ncol = 3): data length [4] is
## not a sub-multiple or multiple of the number of columns [3]
##      [,1] [,2] [,3]
## [1,]    1    3    1
## [2,]    2    4    2
# R treats a scalar as a vector whose length conforms to cbind() or rbind()
cbind(mat1, colA = 1)
##    v1 v2 v3 colA
## r1  1  3  5    1
## r2  2  4  6    1
rbind(mat1, rowA= 1, rowB= 2, rowC= 3)
##      v1 v2 v3
## r1    1  3  5
## r2    2  4  6
## rowA  1  1  1
## rowB  2  2  2
## rowC  3  3  3

To replace elements of a matrix, we can use assignment operator <-.

mat1[1,1] <- 10
mat1
##    v1 v2 v3
## r1 10  3  5
## r2  2  4  6
mat1[,2] <- c(7,8)
mat1
##    v1 v2 v3
## r1 10  7  5
## r2  2  8  6
mat1[,1] <- 0  # recycling rule
mat1
##    v1 v2 v3
## r1  0  7  5
## r2  0  8  6

Rows and columns of a matrix can be extracted easily with [row, column] indexing, where the row and column indices are separated by a comma.

mat1
##    v1 v2 v3
## r1  0  7  5
## r2  0  8  6
mat1[1, ]  # row = 1 and all columns
## v1 v2 v3 
##  0  7  5
mat1[, 1]  # all rows and col = 1 
## r1 r2 
##  0  0
mat1[c(TRUE,FALSE),]  # by a logical vector 
## v1 v2 v3 
##  0  7  5
mat1[, c(TRUE,FALSE)]
##    v1 v3
## r1  0  5
## r2  0  6
mat1[2,3]  # row = 2 and col = 3
## [1] 6
mat1[1:2, 2:3]  # row = 1:2 and col = 2:3
##    v2 v3
## r1  7  5
## r2  8  6
mat1[1:2, 2:3][2,2]  # subset of a subset
## [1] 6
mat1[, 1][2]  # vector extraction is done with one-dimensional index  
## r2 
##  0

Important: when a single row or column is extracted, it gets converted to a vector with no dimension.

mat1[1,]
## v1 v2 v3 
##  0  7  5
is.matrix(mat1[1, ])  
## [1] FALSE
dim(mat1[1,])
## NULL
length(mat1[1, ])  
## [1] 3
# to keep a row or column vector structure, use drop = FALSE
mat1[1,, drop = FALSE]
##    v1 v2 v3
## r1  0  7  5
is.matrix(mat1[1,,drop = FALSE])  
## [1] TRUE
dim(mat1[1,,drop = FALSE])
## [1] 1 3
length(mat1[1,,drop = FALSE])  
## [1] 3
mat1[,1, drop = FALSE]
##    v1
## r1  0
## r2  0
is.matrix(mat1[,1,drop = FALSE])  
## [1] TRUE
dim(mat1[,1,drop = FALSE])
## [1] 2 1
length(mat1[,1,drop = FALSE])  
## [1] 2

Another way of extraction from a matrix is to use row or column names.

mat1[,'v1']
## r1 r2 
##  0  0
mat1[,c('v1','v3')]
##    v1 v3
## r1  0  5
## r2  0  6
mat1['r2',,drop= FALSE]
##    v1 v2 v3
## r2  0  8  6

apply() applies a function for a specified margin (dimension index number) of the matrix.

mat1
##    v1 v2 v3
## r1  0  7  5
## r2  0  8  6
apply(mat1,1,mean)  # MARGIN = 1: apply mean() to each row
##       r1       r2 
## 4.000000 4.666667
apply(mat1,2,mean)  # MARGIN = 2: apply mean() to each column
##  v1  v2  v3 
## 0.0 7.5 5.5
# one can write a custom function inside apply() (called an anonymous function).
# Its argument corresponds to the row or column vector passed by apply().
apply(mat1,2, function(x) sum(x)/length(x) )  # x is the internal vector name
##  v1  v2  v3 
## 0.0 7.5 5.5
# For every column of mat1, collect its mean and standard deviation.
# Each column's result is a named list, so ans1 is a list with one
# list(Avg, Sd) entry per column.
ans1 <- apply(mat1, 2, function(x) {
  # x is the column vector handed over by apply()
  list(Avg = mean(x), Sd = sd(x))
})

unlist(ans1[[2]])  # results for the second column  
##       Avg        Sd 
## 7.5000000 0.7071068
unlist(ans1[[3]])  # results for the third column  
##       Avg        Sd 
## 5.5000000 0.7071068

Arrays are a generalization of matrices and can have more than 2 dimensions.

array(c(1:18), c(2,3,3))  # dimension 2 by 3 by 3
## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]    7    9   11
## [2,]    8   10   12
## 
## , , 3
## 
##      [,1] [,2] [,3]
## [1,]   13   15   17
## [2,]   14   16   18
array(c(1:9), c(2,3,3))  # R recycles the vector 
## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]    7    9    2
## [2,]    8    1    3
## 
## , , 3
## 
##      [,1] [,2] [,3]
## [1,]    4    6    8
## [2,]    5    7    9

3.2.4 Data Frame

A data frame is similar to a matrix, but it accepts multiple types (modes) of variables across columns (e.g., a dataset in typical data analysis programs like SAS, SPSS, Stata etc.). In some cases matrices and data frames may be treated interchangeably, but generally they need to be distinguished. Data manipulation functions are often written for data frames, while some base R functions are written for matrices.

mymat1 <- matrix(data = c(1:6), nrow = 2, ncol = 3, 
       dimnames = list(c("r1","r2"), c("c1","c2","c3"))) 
mymat1
##    c1 c2 c3
## r1  1  3  5
## r2  2  4  6
# NOTE: from R 4.0.0 on this prints "matrix" "array", because matrices also
# inherit from class "array"; the output below presumably comes from an
# earlier R version.
class(mymat1)
## [1] "matrix"
colnames(mymat1)
## [1] "c1" "c2" "c3"
names(mymat1)
## NULL
mydf1 <- data.frame(
                mymat1, 
                num  = c(1,2),         
                fac1 = c("a","abc"),    
                logi = c(TRUE, FALSE), 
                fac2  = c(1,"a")   
              )  
mydf1
##    c1 c2 c3 num fac1  logi fac2
## r1  1  3  5   1    a  TRUE    1
## r2  2  4  6   2  abc FALSE    a
class(mydf1)
## [1] "data.frame"
colnames(mydf1)
## [1] "c1"   "c2"   "c3"   "num"  "fac1" "logi" "fac2"
names(mydf1)   # colnames and names are the same 
## [1] "c1"   "c2"   "c3"   "num"  "fac1" "logi" "fac2"

Extracting elements from a data frame is similar to extracting from a matrix, but there are a few additional methods.

mydf1[1,]   # row = 1 and all columns 
##    c1 c2 c3 num fac1 logi fac2
## r1  1  3  5   1    a TRUE    1
mydf1[,1]   # all rows and col = 1
## [1] 1 2
# data frame preserves dimension while extracting a row but not a column
dim(mydf1[1,])  
## [1] 1 7
dim(mydf1[,1])  
## NULL
dim(mydf1[,1,drop=FALSE])   # use drop = FALSE to keep a column vector
## [1] 2 1
mydf1[,1,drop=FALSE]
##    c1
## r1  1
## r2  2
mydf1[, c('c1','num','logi')]
##    c1 num  logi
## r1  1   1  TRUE
## r2  2   2 FALSE
class(mydf1[, c('c1','num','logi')])
## [1] "data.frame"
# extraction by column name with "$" symbol:  df$varname  
mydf1$c1
## [1] 1 2
dim(mydf1$c1)
## NULL
# one can use quote ' ' or " " as well 
mydf1$'c1'
## [1] 1 2
# similarly, extraction by column name with [[ ]]: df[['varname']]    
mydf1[['c1']]
## [1] 1 2
dim(mydf1[['c1']])
## NULL
# or by index 
mydf1[[1]]
## [1] 1 2
# [[ ]] method is useful when passing a variable name as a string
set_to_na <- function(df, var) {
  # Replace the whole column selected by `var` with NA. `[[ ]]` accepts the
  # column name held in a variable, which `$` cannot do.
  df[[var]] <- NA
  # Hand back the modified copy; the data frame passed in is left unchanged.
  df
}
mydf1
##    c1 c2 c3 num fac1  logi fac2
## r1  1  3  5   1    a  TRUE    1
## r2  2  4  6   2  abc FALSE    a
mydf2 <- set_to_na(mydf1, "c2") 
mydf2
##    c1 c2 c3 num fac1  logi fac2
## r1  1 NA  5   1    a  TRUE    1
## r2  2 NA  6   2  abc FALSE    a
# add a variable 
mydf1$newvar <- c(4, 4)
mydf1
##    c1 c2 c3 num fac1  logi fac2 newvar
## r1  1  3  5   1    a  TRUE    1      4
## r2  2  4  6   2  abc FALSE    a      4
mydf1$newvar2 <- mydf1$c2 + mydf1$c3
mydf1
##    c1 c2 c3 num fac1  logi fac2 newvar newvar2
## r1  1  3  5   1    a  TRUE    1      4       8
## r2  2  4  6   2  abc FALSE    a      4      10

apply() may not work well with data frames because a data frame is not a matrix: apply() first coerces it to a matrix, which forces all columns to a common type. We can use sapply() (simplified apply) or lapply() (list apply) instead.

mydf1 
##    c1 c2 c3 num fac1  logi fac2 newvar newvar2
## r1  1  3  5   1    a  TRUE    1      4       8
## r2  2  4  6   2  abc FALSE    a      4      10
# sapply() 
idx_num <- sapply(mydf1, is.numeric) 
idx_num
##      c1      c2      c3     num    fac1    logi    fac2  newvar newvar2 
##    TRUE    TRUE    TRUE    TRUE   FALSE   FALSE   FALSE    TRUE    TRUE
apply(mydf1[,idx_num], 2, mean)
##      c1      c2      c3     num  newvar newvar2 
##     1.5     3.5     5.5     1.5     4.0     9.0
sapply(mydf1[,idx_num], mean)
##      c1      c2      c3     num  newvar newvar2 
##     1.5     3.5     5.5     1.5     4.0     9.0
# lapply() 
idx_num2 <- unlist(lapply(mydf1, is.numeric)) 
idx_num2
##      c1      c2      c3     num    fac1    logi    fac2  newvar newvar2 
##    TRUE    TRUE    TRUE    TRUE   FALSE   FALSE   FALSE    TRUE    TRUE
unlist(lapply(mydf1[,idx_num2], mean))
##      c1      c2      c3     num  newvar newvar2 
##     1.5     3.5     5.5     1.5     4.0     9.0

3.2.5 List

A list is an ordered collection of (possibly unrelated) objects. The objects in a list are referenced by position, [[1]], [[2]], …, or by name, [["var1"]], [["var2"]], … etc.

# A list holding unrelated objects: a numeric vector, a character vector,
# a factor, and a data frame.
# stringsAsFactors = TRUE is given explicitly so that column u2 is a factor
# on every R version (the data.frame() default became FALSE in R 4.0.0).
mylist1 <- list(v1 = c(1,2,3),
             v2 = c("a","b"),
             v3 = factor(c("blue","red","orange","yellow")),
             v4 = data.frame( u1 = c(1:3), u2 = c("p","q","r"),
                              stringsAsFactors = TRUE)
             )
mylist1 
## $v1
## [1] 1 2 3
## 
## $v2
## [1] "a" "b"
## 
## $v3
## [1] blue   red    orange yellow
## Levels: blue orange red yellow
## 
## $v4
##   u1 u2
## 1  1  p
## 2  2  q
## 3  3  r
# extraction
mylist1[[1]]             
## [1] 1 2 3
mylist1[["v2"]]             
## [1] "a" "b"
mylist1$v3      
## [1] blue   red    orange yellow
## Levels: blue orange red yellow
mylist1$v4$u2
## [1] p q r
## Levels: p q r
# assignment
mylist1$v5 <- c("a",NA)
mylist1$v5 
## [1] "a" NA
# a list can be nested 
mylist1$v6 <- list(y1 = c(2,9), y2 = c(0,0,0,1))
mylist1$v6
## $y1
## [1] 2 9
## 
## $y2
## [1] 0 0 0 1

lapply() is very versatile since the items in a list can be completely unrelated.

unlist(lapply(mylist1, class))
##           v1           v2           v3           v4           v5 
##    "numeric"  "character"     "factor" "data.frame"  "character" 
##           v6 
##       "list"
unlist(lapply(mylist1, attributes))  # some variables have attributes
##    v3.levels1    v3.levels2    v3.levels3    v3.levels4      v3.class 
##        "blue"      "orange"         "red"      "yellow"      "factor" 
##     v4.names1     v4.names2 v4.row.names1 v4.row.names2 v4.row.names3 
##          "u1"          "u2"           "1"           "2"           "3" 
##      v4.class     v6.names1     v6.names2 
##  "data.frame"          "y1"          "y2"
# Summarise every element of mylist1 according to its type.
# The tests run in order: a data frame is also a list, so the data-frame
# test has to come before the generic list test. Elements matching none of
# the tests yield NULL.
lapply(mylist1, function(x) {
  if (is.numeric(x)) {
    summary(x)
  } else if (is.character(x)) {
    x
  } else if (is.factor(x)) {
    table(x)
  } else if (is.data.frame(x)) {
    head(x)
  } else if (is.list(x)) {
    unlist(lapply(x, class))
  }
})
## $v1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0     1.5     2.0     2.0     2.5     3.0 
## 
## $v2
## [1] "a" "b"
## 
## $v3
## x
##   blue orange    red yellow 
##      1      1      1      1 
## 
## $v4
##   u1 u2
## 1  1  p
## 2  2  q
## 3  3  r
## 
## $v5
## [1] "a" NA 
## 
## $v6
##        y1        y2 
## "numeric" "numeric"