Skip to contents
library(qzwslrm)

Background

As described in Legarra and Reverter (2018), validation of predicted breeding values is becoming more and more important. Their work uses a cross-validation based approach using results of successive genetic evaluations. These results consist of \(\hat{u}_p\) and \(\hat{u}_w\) which are vectors of the same length and contain predicted breeding values based on “partial” (\(p\)) and “whole” (\(w\)) data, respectively.

Cross-Validation Statistics

The following statistics are computed based on the input \(\hat{u}_p\) and \(\hat{u}_w\)

  • bias: \(\overline{\hat{u}_p} - \overline{\hat{u}_w}\)
  • regression of whole on partial: \(b_{w,p} = \frac{cov(\hat{u}_w,\hat{u}_p)}{var(\hat{u}_p)}\)
  • correlation between whole and partial: \(r_{w,p} = \frac{cov(\hat{u}_w,\hat{u}_p)}{\sqrt{var(\hat{u}_p)var(\hat{u}_w)}}\)
  • regression of partial on whole: \(b_{p,w} = \frac{cov(\hat{u}_w,\hat{u}_p)}{var(\hat{u}_w)}\)

Example Data

As shown in https://fbzwsqualitasag.github.io/qzwslrm/articles/qzwslrm_ebv_mix99.html, example data are prepared to run the most basic validation statistics. The example data consists of two results files from MiX99 containing predicted breeding values. The first result file contains EBV predicted based on a complete dataset.

(s_ebv_path_whole <- qzwslrm_example_solani("whole"))
#> [1] "/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani"

The second result file contains EBV predicted for the same animals but based on a partial dataset. For our example, the phenotypic observations of all animals from the latest generation were set to missing (NA).

(s_ebv_path_partial <- qzwslrm_example_solani("partial"))
#> [1] "/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani"

These files can be read using the function read_solani()

(tbl_solani_whole <- readr_solani(ps_path = s_ebv_path_whole))
#> Warning: The following named parsers don't match the column names: nprg
#> Warning: 3844 parsing failures.
#> row col  expected    actual                                                                             file
#>   1  -- 4 columns 5 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#>   2  -- 4 columns 5 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#>   3  -- 4 columns 5 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#>   4  -- 4 columns 5 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#>   6  -- 4 columns 5 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#> ... ... ......... ......... ................................................................................
#> See problems(...) for more details.
#> # A tibble: 4,995 × 4
#>    animal nrprg trait      ebv
#>     <int> <dbl> <int>    <dbl>
#>  1      1    10     1 -0.214  
#>  2      2    30     1  0.110  
#>  3      3    90     1  0.485  
#>  4      4    10     1 -0.154  
#>  5      5    20     1  0.0766 
#>  6      6    20     1 -0.135  
#>  7      7    20     1 -0.0928 
#>  8      8    10     1 -0.00596
#>  9      9    10     1 -0.0130 
#> 10     10    60     1  0.181  
#> # … with 4,985 more rows

The same can be done with the partial data

(tbl_solani_partial <- readr_solani(ps_path = s_ebv_path_partial))
#> Warning: The following named parsers don't match the column names: nprg
#> Warning: 3839 parsing failures.
#> row col  expected    actual                                                                               file
#>   1  -- 4 columns 5 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#>   2  -- 4 columns 5 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#>   3  -- 4 columns 5 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#>   4  -- 4 columns 5 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#>   6  -- 4 columns 5 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#> ... ... ......... ......... ..................................................................................
#> See problems(...) for more details.
#> # A tibble: 4,995 × 4
#>    animal nrprg trait      ebv
#>     <int> <dbl> <int>    <dbl>
#>  1      1    10     1 -0.215  
#>  2      2    30     1  0.129  
#>  3      3    90     1  0.463  
#>  4      4    10     1 -0.154  
#>  5      5    20     1  0.0779 
#>  6      6    20     1 -0.136  
#>  7      7    20     1 -0.0913 
#>  8      8    10     1 -0.00620
#>  9      9    10     1 -0.0198 
#> 10     10    60     1  0.170  
#> # … with 4,985 more rows

Validation

For the validation only the EBV vectors from the full and the partial data are required. Hence the validation function only needs the two vectors as input.

(l_val_result <- val_ebv_lrm(pvec_ebv_partial = tbl_solani_partial$ebv, 
                             pvec_ebv_whole = tbl_solani_whole$ebv))
#> $bias
#> [1] -0.006104794
#> 
#> $reg_wop
#> [1] 1.037712
#> 
#> $cor_wp
#> [1] 0.987303
#> 
#> $reg_pow
#> [1] 0.9393427

Other Input Formats

For reasons of flexibility, EBV can be read from files having a number of different formats such as ‘csv’, ‘csv2’, ‘delim’ or ‘table’. For testing purposes different input files are produced and stored inside of this package

# csv
s_pkg_input_dir <- file.path(here::here(), "inst", "extdata", "ebvinput")
s_csv_input_path <- file.path(s_pkg_input_dir, "ebv_input.csv")
readr::write_csv(tbl_solani_partial, file = s_csv_input_path)

# csv2
s_csv2_input_path <- file.path(s_pkg_input_dir, "ebv_input.csv2")
readr::write_csv(tbl_solani_partial, file = s_csv2_input_path)

# delim - .txt
s_delim_input_path <- file.path(s_pkg_input_dir, "ebv_input.txt")
readr::write_delim(tbl_solani_partial, file = s_delim_input_path, delim = " ")

The input files included in the package can be obtained and read via

sexdf_csv <- qzwslrm_example_input("csv")
tbl_ebv_csv <- readr_ebv(ps_path = sexdf_csv, ps_format = "csv", 
                     ps_animal_col_name = "animal", ps_ebv_col_name = "ebv")
#> Rows: 4995 Columns: 4
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (4): animal, nrprg, trait, ebv
#> 
#>  Use `spec()` to retrieve the full column specification for this data.
#>  Specify the column types or set `show_col_types = FALSE` to quiet this message.
tbl_ebv_csv
#> # A tibble: 4,995 × 2
#>    animal      ebv
#>     <dbl>    <dbl>
#>  1      1 -0.215  
#>  2      2  0.129  
#>  3      3  0.463  
#>  4      4 -0.154  
#>  5      5  0.0779 
#>  6      6 -0.136  
#>  7      7 -0.0913 
#>  8      8 -0.00620
#>  9      9 -0.0198 
#> 10     10  0.170  
#> # … with 4,985 more rows

Testing the input format “table”

sexdf_delim <- qzwslrm_example_input(ps_data_format = "delim")
tbl_ebv_table <- readr_ebv(ps_path = sexdf_delim, ps_format = "table",
                           ps_animal_col_name = "animal", ps_ebv_col_name = "ebv")
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   animal = col_double(),
#>   nrprg = col_double(),
#>   trait = col_double(),
#>   ebv = col_double()
#> )
tbl_ebv_table
#> # A tibble: 4,995 × 2
#>    animal      ebv
#>     <dbl>    <dbl>
#>  1      1 -0.215  
#>  2      2  0.129  
#>  3      3  0.463  
#>  4      4 -0.154  
#>  5      5  0.0779 
#>  6      6 -0.136  
#>  7      7 -0.0913 
#>  8      8 -0.00620
#>  9      9 -0.0198 
#> 10     10  0.170  
#> # … with 4,985 more rows

The generic function readr_ebv() for reading ebv from input files can read data from files with different formats. Furthermore, the input files can contain more information than just EBV. But the result of the reader function is always a tibble with two columns. The first column contains the animal IDs and the second the EBV of the animal.

Summary Output

The output of the validation function is formatted using a summary function summary_lrm().

tbl_ebv_whole <- readr_ebv(ps_path = qzwslrm_example_solani("whole"), ps_format = "table",
                            pn_ebv_col_idx = 4)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   X1 = col_double(),
#>   X2 = col_double(),
#>   X3 = col_double(),
#>   X4 = col_double(),
#>   X5 = col_logical()
#> )
#> Warning: 1151 parsing failures.
#> row col  expected    actual                                                                             file
#>   5  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#>   7  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#>   8  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#>   9  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#>  14  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#> ... ... ......... ......... ................................................................................
#> See problems(...) for more details.
tbl_ebv_partial <- readr_ebv(ps_path = qzwslrm_example_solani("partial"), ps_format = "table",
                            pn_ebv_col_idx = 4)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   X1 = col_double(),
#>   X2 = col_double(),
#>   X3 = col_double(),
#>   X4 = col_double(),
#>   X5 = col_logical()
#> )
#> Warning: 1156 parsing failures.
#> row col  expected    actual                                                                               file
#>   5  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#>   7  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#>   8  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#>   9  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#>  14  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#> ... ... ......... ......... ..................................................................................
#> See problems(...) for more details.
summary_lrm(l_val_result <- val_ebv_lrm(pvec_ebv_partial = tbl_ebv_partial$ebv, 
                             pvec_ebv_whole = tbl_ebv_whole$ebv))
#> 
#> Bias between partial and whole:  -0.0061
#> Regression whole on partial:         1.0377
#> Correlation whole and partial:       0.9873
#> Regression partial on whole:         0.9393

Result Tibble

The results can also be converted into a tibble. This can be useful for an output as a table

 tbl_ebv_whole <- readr_ebv(ps_path = qzwslrm_example_solani("whole"), ps_format = "table",
                            pn_ebv_col_idx = 4)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   X1 = col_double(),
#>   X2 = col_double(),
#>   X3 = col_double(),
#>   X4 = col_double(),
#>   X5 = col_logical()
#> )
#> Warning: 1151 parsing failures.
#> row col  expected    actual                                                                             file
#>   5  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#>   7  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#>   8  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#>   9  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#>  14  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/whole_data/Solani'
#> ... ... ......... ......... ................................................................................
#> See problems(...) for more details.
 tbl_ebv_partial <- readr_ebv(ps_path = qzwslrm_example_solani("partial"), ps_format = "table",
                              pn_ebv_col_idx = 4)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   X1 = col_double(),
#>   X2 = col_double(),
#>   X3 = col_double(),
#>   X4 = col_double(),
#>   X5 = col_logical()
#> )
#> Warning: 1156 parsing failures.
#> row col  expected    actual                                                                               file
#>   5  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#>   7  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#>   8  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#>   9  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#>  14  -- 5 columns 4 columns '/tmp/RtmpQ6Sl2w/temp_libpath1e0d2c67f2/qzwslrm/extdata/mix99/partial_data/Solani'
#> ... ... ......... ......... ..................................................................................
#> See problems(...) for more details.
 tibble_lrm(l_val_result <- val_ebv_lrm(pvec_ebv_partial = tbl_ebv_partial$ebv,
                                         pvec_ebv_whole = tbl_ebv_whole$ebv))
#> # A tibble: 4 × 2
#>   `Validation Statistic`           Value
#>   <chr>                            <dbl>
#> 1 Bias between partial and whole -0.0061
#> 2 Regression whole on partial     1.04  
#> 3 Correlation whole and partial   0.987 
#> 4 Regression partial on whole     0.939

Scatterplots of EBV from Whole and Partial Data

The function scatterplot_lrm() creates a scatterplot of the EBV from whole and partial data

tbl_ebv_whole <- readr_ebv(ps_path = qzwslrm_example_solani("whole"), ps_format = "table",
                            pn_ebv_col_idx = 4)
tbl_ebv_partial <- readr_ebv(ps_path = qzwslrm_example_solani("partial"), ps_format = "table",
                              pn_ebv_col_idx = 4)
p <- scatterplot_lrm(tbl_ebv_whole, tbl_ebv_partial)
print(p)

The above plot shows a scatterplot of the ebv from whole data and partial data. The blue line is the linear smoothed regression line and the red line is the expected regression line with a slope of \(1\).