The package can be installed via Bioconductor, as shown below:
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("rSWeeP")
Consider a set of 13 mitochondrial proteomes (translated CDSs)
deposited in the folder at the address path
in FASTA
format. These sequences can be vectorized
library(rSWeeP)
path = paste (system.file("examples/aaMitochondrial/",package = "rSWeeP"),'/', sep = '')
sw = SWeePlite(path,seqtype='AA',mask=c(4),psz=1000)
In sw$proj
you will find the SWeeP vectors in matrix
format with 13 rows (each row a sample) and 1000 columns (coordinates).
In sw$info
other processing information is stored, which
may be important in subsequent steps.
sw$info
## $ProjectionSize
## [1] 1000
##
## $mask
## [1] 1 1 1 1
##
## $SequenceType
## [1] "AA"
##
## $concatenate
## [1] FALSE
##
## $bin
## [1] "counting (FALSE)"
##
## $version
## [1] "SWeePlite v2.8"
##
## $norm
## [1] "none"
##
## $timeElapsed
## [1] 5.452
##
## $headers
## [1] "01_Pan_troglodytes" "02_Capra_aegagrus"
## [3] "03_Homo_sapiens" "04_Bos_taurus"
## [5] "05_Ara_ararauna" "06_Mus_musculus"
## [7] "07_Brotogeris_cyanoptera" "08_Homo_sapiens_neanderthalensis"
## [9] "09_Gazella_gazella" "10_Rattus_norvegicus"
## [11] "11_Pan_paniscus" "12_Psittacara_rubritorquis"
## [13] "13_Apodemus_sylvaticus" "14_Rhazya_stricta"
$ProjectionSize
- length of the output vector (length
of the projection)$mask
- mask used (provided in extended binary
format)$SequenceType
- data type (amino acid or
nucleotide)$concatenate
- if the files were concatenated into
one$bin
- whether binary (TRUE) or count (FALSE)$version
- version of rSWeeP used$norm
- type of normalization used (‘none’, ‘log’ or
‘logNeg’)$timeElapsed
- time spent in processing$headers
- name of the vectorized files/samplesTo visualize the vectorized data, we provide the metadata with the classes at the family taxonomic level.
pathmetadata <- system.file(package = "rSWeeP" , "examples" , "metadata_mitochondrial.csv")
mt = read.csv(pathmetadata,header=TRUE)
vunq = unique(mt$family) # list of unique families
ncl = length(vunq) # number of families
cl = mt$id # numeric classes
We obtain the PCA and visualize the first components.
pca_output <- prcomp (sw$proj , scale = FALSE)
par(mfrow=c(1,2))
plot(pca_output$x[,1] , pca_output$x [,2] , xlab = 'PC-1' , ylab = 'PC-2' , pch =20 , col = cl)
legend("bottomright" , vunq , col = as.character(c(1:ncl)) , pch =20)
plot(pca_output$x[,3] , pca_output$x [,4] , xlab = 'PC-3' , ylab = 'PC-4' , pch =20 , col = cl)
We obtain the tSNE with two dimensions.
library(Rtsne)
tsne_output <- Rtsne::Rtsne (sw$proj , dims =2 , pca = FALSE , perplexity =2 , dist = euclidean)
plot(tsne_output$Y[,1] , tsne_output$Y [,2] , xlab = 'tSNE-1' , ylab = 'tSNE-2' , pch =20,col=cl)
legend("topleft" , vunq , col = as.character(c(1:ncl)) , pch =20)
The heatmap makes the related groups in the data set visible, based on the Euclidean distance matrix. These groups coincide with taxonomic families. Black indicates greater similarity, and white no similarity.
data = sw$proj
rownames(data) = mt$family
mdist = dist(data,method='euclidean')
heatmap(as.matrix(mdist),symm=T,col=colorRampPalette(c('#000000','#FFFFFF'))(10))
We can also obtain the phylogenetic relationship between the vectorized organisms using the Neighbour Joining (NJ) method.
library(ape)
library(ggtree)
# get the distance matrix
mdist = dist(sw$proj,method='euclidean')
# use the NJ algorithm to build the tree
tr = nj(mdist)
# root the tree in the plant sample
tr = root(tr,outgroup='14_Rhazya_stricta')
# plot
p<- ggtree(tr)+ geom_tiplab()
p
Naming according to family
library(ape)
library(ggtree)
# Rename the data to display the grouping by family
data = sw$proj
rownames(data) = mt$family
# get the distance matrix
mdist = dist(data,method='euclidean')
# use the NJ algorithm to build the tree
tr = nj(mdist)
# root the tree in the plant sample
tr = root(tr,outgroup='Apocynaceae')
# plot
p<- ggtree(tr)+ geom_tiplab()
p
More information can be found in the package manual.
?rSWeeP