Tutorial

# In R:
# Load iPsychCNV package
library(iPsychCNV)

Run iPsychCNV Package

## Creates a mock data that simulates Infinium PsychArray BeadChip from Illumina. ## Mock data uses same SNP position as PsychArray chip. MockCNVs <- MockData(N=1, Type="Blood", Cores=1) # Running iPsychCNV on mock data. CNVs <- iPsychCNV(PathRawData=".", Cores=1, Pattern="^MockSample*", Skip=0) # PathRawData: Data path. # Cores = 1, number of CPUs to be used. # Pattern = "^MockSample*". Scan for the pattern in a given folder.

EVALUATE PERFORMANCE

# Test and evaluate three methods using long mock data. CNV.Prediction <- RunLongMock(Name="All", Method="All",
HMM="/media/NeoScreen/NeSc_home/share/Programs/penncnv/lib/hhall.hmm",
Path2PennCNV="/media/NeoScreen/NeSc_home/share/Programs/penncnv/")

iPsychCNV prediction.

Gada prediction.

PennCNV prediction PFB 0.5 for each SNP (PennCNV NO).

PennCNV prediction PFB calculated for each SNP (PennCNV NA).

ROC curve.


HOTSPOTS

# Creating a mock data. MockCNVs <- MockData(N=10, Type="Blood", Cores=10) # MockCNVs data.frame format (detail information about CNVs in each sample): str(MockCNVs) 'data.frame': 5480 obs. of 15 variables: $ Start : int 752566 6530391 7912975 14210730 16847832 20645925 21009279 27210667 29763684 35824753 ... $ Stop : int 6530391 7912975 14210730 16847832 20645925 21009279 27210667 29763684 35824753 36557619 ... $ StartIndx : num 1 2000 2400 4000 4800 6000 6150 8000 8600 10000 ... $ StopIndx : num 2000 2400 4000 4800 6000 6150 8000 8600 10000 10200 ... $ NumSNPs : num 1999 400 1600 800 1200 ... $ Chr : int 1 1 1 1 1 1 1 1 1 1 ... $ CNVmean : num 0 0.35 0 -0.35 0 -0.4 0 0.5 0 -0.35 ... $ CN : num 2 4 2 1 2 0 2 3 2 1 ... $ sd : num 0.2 0.15 0.2 0.15 0.2 0.15 0.2 0.15 0.2 0.15 ... $ ID : chr "MockSample_1.tab" "MockSample_1.tab" "MockSample_1.tab" "MockSample_1.tab" ... $ NumCNVs : num 25 25 25 25 25 25 25 25 25 25 ... $ ChrMean : num 0.0456 0.0456 0.0456 0.0456 0.0456 ... $ Length : int 5777825 1382584 6297755 2637102 3798093 363354 6201388 2553017 6061069 732866 ... $ CNVID : int 1 1 2 2 3 3 4 4 5 5 ... $ PositionID: chr "1_2000" "2000_2400" "2400_4000" "4000_4800" ... # Predicting CNVs using iPsychCNV CNVs <- iPsychCNV(PathRawData=".", Pattern="^MockSample*", Skip=0) CNVs.Good <- subset(CNVs, CN != 2) # keep only CNVs with CN = 0, 1, 3, 4. # iPsychCNV results data.frame format: str(CNVs.Good) 'data.frame': 2579 obs. of 56 variables: $ Start : int 95947234 6530372 14209336 20645874 27190196 35824706 43672394 53164220 62253546 74174778 ... $ Stop : int 98808760 7912975 16847832 21009279 29763684 36557619 46094037 54546449 62676947 74737345 ... $ CNVMean : num -0.279 0.259 -0.245 -0.283 0.426 ... $ StartIndx : num 21999 1999 3999 5999 7999 ... $ StopIndx : num 22400 2400 4800 6150 8600 ... $ Length : int 2861526 1382603 2638496 363405 2573488 732913 2421643 1382229 423401 562567 ... $ NumSNPs : num 401 401 801 151 601 201 801 401 151 101 ... $ Chr : int 1 1 1 1 1 1 1 1 1 1 ... $ ID : chr "MockSample_1.tab" "MockSample_1.tab" "MockSample_1.tab" "MockSample_1.tab" ... $ CN : num 0 4 1 0 3 1 4 4 1 3 ... $ CNVID : chr "1" "2" "3" "4" ... $ AAAA : num 12.9 37.8 48 13.2 42.7 49 35 37.1 55.9 41.2 ... $ AAAB : num 19.2 0 0.1 19.7 0 0 0 0 0 0 ... $ AAB : num 16.2 8.5 0 10.5 9.6 0 7.9 7.5 0 8.8 ... $ AB : num 14.7 8.2 0 19.7 0 0 9.6 13.4 0 0 ... $ ABB : num 13.4 8.2 0 10.5 8.1 0 8.9 9 0 14.7 ... $ ABBB : num 14.9 0 0 18.4 0 0 0 0 0 0 ... $ BBBB : num 8.7 37.3 51.9 7.9 39.5 51 38.7 33.1 44.1 35.3 ... $ UsedBAF : num 100 100 100 100 100 100 100 100 100 100 ... $ CNVmean : num -0.285 0.259 -0.245 -0.283 0.426 ... $ HighMean : num 0.00403 0.01017 0.00521 0.00845 -0.00302 ... $ LowMean : num 0.010138 0.001656 0.01133 0.011527 -0.000432 ... $ CNVmeanOrig : num -0.373 0.374 -0.325 -0.371 0.524 ... $ SDCNV : num 0.029 0.0256 0.0229 0.0386 0.0285 ... $ SDMeansCNV : num 0.00742 0.0061 0.00549 0.01332 0.00549 ... $ SDHigh : num 0.111 0.112 0.112 0.115 0.114 ... $ SDLow : num 0.109 0.114 0.111 0.11 0.114 ... $ VarianceCNV : num 0.00084 0.000657 0.000525 0.001488 0.000812 ... $ DiffHigh : num 0.289 0.249 0.251 0.291 0.429 ... $ DiffLow : num 0.295 0.257 0.257 0.294 0.426 ... $ Wave.Length.Mean: num 3.04 2.91 2.96 3.13 2.9 ... $ Wave.Length.SD : num 1.084 0.956 1.098 1.169 0.924 ... $ Amplitude.Mean : num 0.001397 -0.001268 0.000632 0.004513 -0.001248 ... $ Amplitude.SD : num 0.0475 0.0431 0.041 0.0573 0.0465 ... $ CNV2HighPvalue : num 4.77e-188 1.29e-158 3.63e-318 3.65e-71 0.00 ... $ CNV2LowPvalue : num 5.93e-194 3.97e-161 0.00 4.25e-75 0.00 ... $ BAlleleFreq : chr "Undefined" "Undefined" "1" "Undefined" ... $ LogRRatio : num 1 3 1 1 3 1 3 3 1 3 ... $ SumPeaks : int 1 3 2 3 2 2 3 3 2 2 ... $ MyBAF : num 0 4 1 0 3 1 4 4 1 3 ... $ SDChr : num 0.196 0.196 0.196 0.196 0.196 ... $ MeanChr : num 0.0188 0.0188 0.0188 0.0188 0.0188 ... $ Source : chr "iPsychCNV" "iPsychCNV" "iPsychCNV" "iPsychCNV" ... # Searching for hotspots CNVs.Hotspots <- HotspotsCNV(df=CNVs.Good, Freq=1, OverlapCutoff=0.7, Cores=22) # Hotspots data.frame format: str(CNVs.Hotspots) 'data.frame': 260 obs. of 18 variables: $ Chr : num 1 1 1 1 1 1 1 1 1 1 ... $ Start : num 1.10e+08 1.17e+08 1.42e+07 1.51e+08 1.55e+08 ... $ Stop : num 1.12e+08 1.18e+08 1.68e+07 1.52e+08 1.57e+08 ... $ Length : num 2658481 1794999 2638496 1307606 1952779 ... $ Count : num 10 8 9 10 10 10 8 10 9 8 ... $ Source : chr "iPsychCNV" "iPsychCNV" "iPsychCNV" "iPsychCNV" ... $ NewName : chr "1_109650542_112309023_2658481" "1_116691120_118486119_1794999"... $ CHR : chr "chr1" "chr1" "chr1" "chr1" ... $ CNV_Start: num 1.10e+08 1.17e+08 1.42e+07 1.51e+08 1.55e+08 ... $ CNV_Stop : num 1.12e+08 1.18e+08 1.68e+07 1.52e+08 1.57e+08 ... $ locus : chr " 1p13.3-p13.2" " 1p13.1-p12" " 1p36.21-p36.13" " 1q21.3" ... $ ID : chr "ROI" "ROI" "ROI" "ROI" ... $ Class : chr "ROI" "ROI" "ROI" "ROI" ... $ CN0 : num 4 2 4 2 1 4 3 4 4 0 ... $ CN1 : num 1 3 6 1 4 2 3 3 2 3 ... $ CN3 : num 2 1 0 1 3 3 3 1 2 4 ... $ CN4 : num 3 4 0 6 2 1 0 2 1 2 ... $ CNTotal : num 10 10 10 10 10 10 9 10 9 9 ... # Ploting CNVs and Hotspots PlotAllCNVs(CNVs.Good, Name="CNVs.Hotspots.png", Roi=CNVs.Hotspots)

CNV prediction from iPsychCNV and hotspots.

# Plotting individual CNVs from a data.frame
PlotCNVsFromDataFrame(DF=CNVs.Good[1:4,], PathRawData=".", Cores=1, Skip=0, Pattern="^MockSamples*")


Example of mock CNV with CN = 0.

Example of mock CNV with CN = 1.

Example of mock CNV with CN = 3.

Example of mock CNV with CN = 4.


Long Mock

# Long mock has only one chromosome, and doesn't use PsychArray chip information. # It is created to test combination of LRR and BAF found in DBS data. # Input for long mock. More variables can be included. # For example: Size=c(100, 300, 600, 2000) CNVDistance=1000 # Distance among CNVs. Type=c(0,1,2,3,4) # CNV copy number. Mean=c(-0.6, -0.3, 0.3, 0.6) # CNV LRR mean. Size=c(300, 600) # CNV number of SNPs. Method = "iPsychCNV" # Method used Name="LongMock" # CNVMean=0.3 # If method has no CNV mean output. Name <- paste(Name,"_",Method, "_LongMockResult.png", sep="", collapse="") # png plot name. # Running long mock step by step LongRoi <- MakeLongMockSample(CNVDistance, Type, Mean, Size) # Reading long mock to an object in R. Sample <- read.table("LongMockSample.tab", sep="\t", header=TRUE, stringsAsFactors=F) # Running iPsychCNV on long mock data. PredictedCNV <- iPsychCNV(PathRawData=".", Pattern="^LongMockSample.tab$", Skip=0) # Plotting LRR and BAF from PlotLRRAndCNVs(PredictedCNV, Sample, CNVMean, Name=Name, Roi=LongRoi) # Evaluating CNVs CNVs.Eval <- EvaluateMockResults(LongRoi, PredictedCNV) # Print ROC curve. plot.roc(CNVs.Eval$CNV.Predicted, CNVs.Eval$CNV.Present, percent=TRUE, col="#1c61b6", print.auc=TRUE)

iPSYCHCNV STEP By STEP

# Function to read sample. Mock.chr22 <- ReadSample(RawFile="MockSample_1.tab", skip=0, LCR=FALSE, chr=22) # Function to find CNVs. Mock.chr22.CNVs <- FindCNV.V4(ID="Test", MinNumSNPs=10, CNV=Mock.chr22) # Function to validate CNVs. Mock.chr22.CNVs.validation <- FilterCNVs.V4(CNVs=Mock.chr22.CNVs, MinNumSNPs=10, CNV=Mock.chr22, ID="Test")

Merge CNVs

# Using low penvalue to break CNV prediction. Mock.chr22.CNVs <- FindCNV.V4(ID="Test", MinNumSNPs=10, CNV=Mock.chr22, CPTmethod="meanvar", penvalue=10) # Second and Third CNV should be merged. Gap of only 11 SNPs or 47171 bp. str(Mock.chr22.CNVs) 'data.frame': 5 obs. of 10 variables: $ Start : int 24583365 32009612 33430734 38864287 45309851 $ Stop : int 25938025 33383563 35786182 39385595 46854398 $ CNVMean : num -0.454 0.387 0.391 0.428 0.379 $ StartIndx: num 1999 4006 4424 5999 7999 $ StopIndx : num 2400 4413 4800 6139 8600 $ Length : int 1354660 1373951 2355448 521308 1544547 $ NumSNPs : num 401 407 376 140 601 $ Chr : int 22 22 22 22 22 $ ID : chr "Test" "Test" "Test" "Test" ... $ CN : num 1 3 3 3 3 Mock.chr22.CNVs.merged <- MergeCNVs(Mock.chr22.CNVs, MaxNumSNPs=50) str(Mock.chr22.CNVs.merged) 'data.frame': 4 obs. of 10 variables: $ Start : int 24583365 32009612 38864287 45309851 $ Stop : int 25938025 35786182 39385595 46854398 $ StartIndx: num 1999 4006 5999 7999 $ StopIndx : num 2400 4800 6139 8600 $ Length : int 1354660 3776570 521308 1544547 $ NumSNPs : num 401 794 140 601 $ CNVMean : num -0.454 0.387 0.428 0.379 $ Chr : int 22 22 22 22 $ ID : chr "Test" "Test" "Test" "Test" $ CN : num 1 3 3 3

Filter CNVs from other methods

# It is possible to validate CNVs using prediction from other methods. # For example, if you have PennCNV results it is possible to run the filter to collect CNV variables #(that are required to run support vector machine) and get a preliminary evaluation. LongRoi <- MakeLongMockSample(Mean=c(-0.6, -0.3, 0.3, 0.6), Size=c(200, 400, 600)) # GADA Sample <- read.table("LongMockSample.tab", sep="\t", header=TRUE, stringsAsFactors=F) Gada <- RunGada(Sample) Gada$ID <- "LongMockSample.tab" PlotLRRAndCNVs(Gada, tmp=Sample, Name="GadaLong.png", CNVMean=0.3, Roi=LongRoi)
Gada prediction on long mock.
Gada.filter <- FilterFromCNVs(CNVs=Gada, PathRawData=".", MinNumSNPs=10, Source="Gada", Skip=0, Cores=1) PlotLRRAndCNVs(Gada.filter, tmp=Sample, Name="Gada.filter.Long.png", CNVMean=0.3, Roi=LongRoi)
Gada filtered prediction on long mock.
# PennCNV LongRoi <- MakeLongMockSample(Mean=c(-0.6, -0.3, 0.3, 0.6), Size=c(200, 400, 600)) Sample <- read.table("LongMockSample.tab", sep="\t", header=TRUE, stringsAsFactors=F) CNVs <- RunPennCNV(PathRawData=".", Pattern="^LongMockSample.tab$", Skip=0, Path2PennCNV="/home/programs/penncnv.tardis/penncnv/", HMM="/home/programs/penncnv.tardis/penncnv/lib/hhall.hmm") PlotLRRAndCNVs(CNV=CNVs, Sample=Sample, Name="PennCNVLong.png", Roi=LongRoi)
PennCNV prediction on long mock.
PennCNV.filter <- FilterFromCNVs(CNVs=CNVs, PathRawData=".", MinNumSNPs=10, Source="PennCNV", Skip=0, Cores=1) PlotLRRAndCNVs(CNV=PennCNV.filter, Sample=Sample, Name="PennCNV.Long.filter.png", Roi=LongRoi)
PennCNV filtered prediction on long mock.

Challenges

Amplified DNA from dried blood spots offers number of challenges for copy number of variation detection. Here we describe some of the challenges one can find working with DBS data.

Tools

iPsychCNV package offers a series of tools that can be used in the CNV prediction pipeline, but also independent with other programs.

Classification

Evaluation of CNV prediction performance is an important step for methods comparison. Here we describe how binary classification is used to evaluate the method performance.

Methods

iPsychCNV uses many different methods to perform a series of functions. Here we describe in detail the methods used by iPsychCNV.

  • Github

    iPsychCNV is an open source R package project. People are welcome to give suggestions, code new functions and/or improve existing ones. The source code is available at Github .

  • About iPsychCNV

    iPsychCNV is a method to find copy number variation from amplified DNA from dried blood spots on Illumina SNP array. It is designed to handle large variation on Log R ratio, and uses B allele frequency to improve CNV calls. iPsychCNV is an open source project on Github

  • About iPSYCH

    The project will study five specific mental disorders; autism, ADHD, schizophrenia, bipolar disorder and depression. All disorders are associated with major human and societal costs all over the world. The iPSYCH project will study these disorders from many different angles, ranging from genes and cells to population studies, from fetus to adult, from cause to symptoms of the disorder, and this knowledge will be combined in new ways across scientific fields, visit iPSYCH.