The package can be installed via Bioconductor, as shown below:

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("rSWeeP")

1. SweeP of mithocondrial proteome

Consider a set of 13 mitochondrial proteomes (translated CDSs) deposited in the folder at the address path in FASTA format. These sequences can be vectorized

library(rSWeeP)
path = paste (system.file("examples/aaMitochondrial/",package = "rSWeeP"),'/', sep = '')
sw = SWeePlite(path,seqtype='AA',mask=c(4),psz=1000)

What does the SWeeP object look like?

In sw$proj you will find the SWeeP vectors in matrix format with 13 rows (each row a sample) and 1000 columns (coordinates). In sw$info other processing information is stored, which may be important in subsequent steps.

sw$info

## $ProjectionSize
## [1] 1000
## 
## $mask
## [1] 1 1 1 1
## 
## $SequenceType
## [1] "AA"
## 
## $concatenate
## [1] FALSE
## 
## $bin
## [1] "counting (FALSE)"
## 
## $version
## [1] "SWeePlite v2.8"
## 
## $norm
## [1] "none"
## 
## $timeElapsed
## [1] 5.452
## 
## $headers
##  [1] "01_Pan_troglodytes"               "02_Capra_aegagrus"               
##  [3] "03_Homo_sapiens"                  "04_Bos_taurus"                   
##  [5] "05_Ara_ararauna"                  "06_Mus_musculus"                 
##  [7] "07_Brotogeris_cyanoptera"         "08_Homo_sapiens_neanderthalensis"
##  [9] "09_Gazella_gazella"               "10_Rattus_norvegicus"            
## [11] "11_Pan_paniscus"                  "12_Psittacara_rubritorquis"      
## [13] "13_Apodemus_sylvaticus"           "14_Rhazya_stricta"

$ProjectionSize - length of the output vector (length of the projection)
$mask - mask used (provided in extended binary format)
$SequenceType - data type (amino acid or nucleotide)
$concatenate - if the files were concatenated into one
$bin - whether binary (TRUE) or count (FALSE)
$version - version of rSWeeP used
$norm - type of normalization used (‘none’, ‘log’ or ‘logNeg’)
$timeElapsed - time spent in processing
$headers - name of the vectorized files/samples

2. Data visualization in graphs

To visualize the vectorized data, we provide the metadata with the classes at the family taxonomic level.

pathmetadata <- system.file(package = "rSWeeP" , "examples" , "metadata_mitochondrial.csv")
mt = read.csv(pathmetadata,header=TRUE)

vunq = unique(mt$family) # list of unique families
ncl = length(vunq) # number of families
cl = mt$id         # numeric classes

2.1. Principal Component Analysis (PCA )

We obtain the PCA and visualize the first components.

pca_output <- prcomp (sw$proj , scale = FALSE)
par(mfrow=c(1,2))
plot(pca_output$x[,1] , pca_output$x [,2] , xlab = 'PC-1' , ylab = 'PC-2' , pch =20 , col = cl)
legend("bottomright" , vunq , col = as.character(c(1:ncl)) , pch =20)
plot(pca_output$x[,3] , pca_output$x [,4] , xlab = 'PC-3' , ylab = 'PC-4' , pch =20 , col = cl)

2.2. tSNE

We obtain the tSNE with two dimensions.

library(Rtsne)
tsne_output <- Rtsne::Rtsne (sw$proj , dims =2 , pca = FALSE , perplexity =2 , dist = euclidean)
plot(tsne_output$Y[,1] , tsne_output$Y [,2] , xlab = 'tSNE-1' , ylab = 'tSNE-2' , pch =20,col=cl)
legend("topleft" , vunq , col = as.character(c(1:ncl)) , pch =20)

2.3. Heatmap

The heatmap makes the related groups in the data set visible, based on the Euclidean distance matrix. These groups coincide with taxonomic families. Black indicates greater similarity, and white no similarity.

data = sw$proj
rownames(data) = mt$family
mdist = dist(data,method='euclidean')
heatmap(as.matrix(mdist),symm=T,col=colorRampPalette(c('#000000','#FFFFFF'))(10))

2.4. Phylogenetic relationship

We can also obtain the phylogenetic relationship between the vectorized organisms using the Neighbour Joining (NJ) method.

library(ape)
library(ggtree)
# get the distance matrix
mdist = dist(sw$proj,method='euclidean')
# use the NJ algorithm to build the tree
tr = nj(mdist)
# root the tree in the plant sample
tr = root(tr,outgroup='14_Rhazya_stricta')
# plot
p<- ggtree(tr)+ geom_tiplab()
p

Naming according to family

library(ape)
library(ggtree)
# Rename the data to display the grouping by family
data = sw$proj
rownames(data) = mt$family
# get the distance matrix
mdist = dist(data,method='euclidean')
# use the NJ algorithm to build the tree
tr = nj(mdist)
# root the tree in the plant sample
tr = root(tr,outgroup='Apocynaceae')
# plot
p<- ggtree(tr)+ geom_tiplab()
p

More information can be found in the package manual.

?rSWeeP

rSWeeP - Quick Start

Camila Pereira Perico

2024-07-13