Import the E. coli dataset from BiGG and create a matrix in R

Here are the libraries needed

library(jsonlite)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(R.matlab)
## R.matlab v3.6.2 (2018-09-26) successfully loaded. See ?R.matlab for help.
## 
## Attaching package: 'R.matlab'
## The following objects are masked from 'package:base':
## 
##     getOption, isOpen
library(volesti)
## Loading required package: Rcpp

First, we try the .json format

We first get the data

# Remote location of the JSON model and the local path it is saved to.
data_url <- "http://bigg.ucsd.edu/static/models/e_coli_core.json"
destination_file <- "/home/haris/Desktop/gsoc2020/gsoc2020/e_coli_core.json"
download.file(url = data_url, destfile = destination_file)

Then we parse them

We need to parse the .json file we got, to get a matrix with the reactions and the metabolites that participate in them.

# Read the downloaded JSON model into R.
data <- jsonlite::fromJSON(destination_file)

# Keep the reactions entry both directly and wrapped in a one-element list.
reactions <- data[["reactions"]]
new_data <- data["reactions"]

# Per-reaction metabolite coefficients and the reaction identifiers.
final_reactions <- new_data[["reactions"]][["metabolites"]]
reactions_names <- reactions[["id"]]

# Put the identifiers in front of the coefficients, then coerce to a matrix.
# Because of the identifier column the result is a character matrix, and
# metabolites that do not take part in a reaction show up as NA.
table <- cbind(reactions_names, final_reactions)
table_as_matrix <- as.matrix(table)

head(table_as_matrix, n = 4)
##   reactions_names adp_c   atp_c    f6p_c     fdp_c h_c     accoa_c   coa_c    
## 1 "PFK"           " 1.00" " -1.00" "-1.0000" " 1"  " 1.00" NA        NA       
## 2 "PFL"           NA      NA       NA        NA    NA      " 1.0000" "-1.0000"
## 3 "PGI"           NA      NA       " 1.0000" NA    NA      NA        NA       
## 4 "PGK"           " 1.00" " -1.00" NA        NA    NA      NA        NA       
##   for_c pyr_c     g6p_c    13dpg_c 3pg_c    6pgc_c 6pgl_c h2o_c acald_c nad_c
## 1 NA    NA        NA       NA      NA       NA     NA     NA    NA      NA   
## 2 " 1"  "-1.0000" NA       NA      NA       NA     NA     NA    NA      NA   
## 3 NA    NA        "-1.000" NA      NA       NA     NA     NA    NA      NA   
## 4 NA    NA        NA       " 1"    "-1.000" NA     NA     NA    NA      NA   
##   nadh_c akg_c akg_e h_e 2pg_c pi_c pi_e etoh_c acald_e ac_c actp_c co2_c oaa_c
## 1 NA     NA    NA    NA  NA    NA   NA   NA     NA      NA   NA     NA    NA   
## 2 NA     NA    NA    NA  NA    NA   NA   NA     NA      NA   NA     NA    NA   
## 3 NA     NA    NA    NA  NA    NA   NA   NA     NA      NA   NA     NA    NA   
## 4 NA     NA    NA    NA  NA    NA   NA   NA     NA      NA   NA     NA    NA   
##   pep_c acon_C_c cit_c icit_c ac_e amp_c succoa_c e4p_c g3p_c gln__L_c glu__L_c
## 1 NA    NA       NA    NA     NA   NA    NA       NA    NA    NA       NA      
## 2 NA    NA       NA    NA     NA   NA    NA       NA    NA    NA       NA      
## 3 NA    NA       NA    NA     NA   NA    NA       NA    NA    NA       NA      
## 4 NA    NA       NA    NA     NA   NA    NA       NA    NA    NA       NA      
##   nadp_c nadph_c r5p_c pyr_e co2_e ru5p__D_c xu5p__D_c succ_c succ_e o2_c q8_c
## 1 NA     NA      NA    NA    NA    NA        NA        NA     NA     NA   NA  
## 2 NA     NA      NA    NA    NA    NA        NA        NA     NA     NA   NA  
## 3 NA     NA      NA    NA    NA    NA        NA        NA     NA     NA   NA  
## 4 NA     NA      NA    NA    NA    NA        NA        NA     NA     NA   NA  
##   q8h2_c lac__D_c lac__D_e etoh_e fum_c s7p_c dhap_c for_e fru_e fum_e glc__D_e
## 1 NA     NA       NA       NA     NA    NA    NA     NA    NA    NA    NA      
## 2 NA     NA       NA       NA     NA    NA    NA     NA    NA    NA    NA      
## 3 NA     NA       NA       NA     NA    NA    NA     NA    NA    NA    NA      
## 4 NA     NA       NA       NA     NA    NA    NA     NA    NA    NA    NA      
##   gln__L_e glu__L_e h2o_e mal__L_e nh4_e o2_e mal__L_c nh4_c glx_c
## 1 NA       NA       NA    NA       NA    NA   NA       NA    NA   
## 2 NA       NA       NA    NA       NA    NA   NA       NA    NA   
## 3 NA       NA       NA    NA       NA    NA   NA       NA    NA   
## 4 NA       NA       NA    NA       NA    NA   NA       NA    NA

Result

Finally, here you can see how a single reaction from the network appears in the first row of our matrix: one molecule of f6p_c and one of atp_c are consumed to produce one each of adp_c, h_c and fdp_c.

One of the reactions of the metabolic network. You can see the metabolites that are consumed as well as those that are produced; the network (left) confirms what the matrix shows (right).

One of the reactions of the metabolic network. You can see the metabolites that are consumed as well as those that are produced; the network (left) confirms what the matrix shows (right).

Second, the .mat format

Get the data.. again

# Download the MATLAB (.mat) version of the model.
# The file must be written to `destination_file_mat`: writing it to
# `destination_file` would overwrite the JSON model downloaded earlier and
# leave the .mat path empty. `mode = "wb"` keeps the binary file intact on
# platforms where download.file() defaults to text mode.
data_url <- "http://bigg.ucsd.edu/static/models/e_coli_core.mat"
destination_file_mat <- "/home/haris/Desktop/gsoc2020/gsoc2020/e_coli_core.mat"
download.file(data_url, destination_file_mat, mode = "wb")

Read the data and parse them properly

#' Parse a metabolic model stored in a MATLAB .mat file
#'
#' Reads the file with `readMat()` (R.matlab), takes the first variable it
#' contains (the model struct) and extracts the reaction, metabolite and gene
#' fields from it.
#'
#' Fields are looked up by name (`"rxns"`, `"mets"`, `"genes"`) when the
#' struct carries field names in its first dimnames entry. Otherwise the
#' positions 8, 1 and 5 are used.
#' NOTE(review): the field names are assumed to follow the usual COBRA
#' naming; confirm against the files you load.
#'
#' @param destination_file Path to the .mat file to read.
#' @return A list with four elements, in this order:
#'   \describe{
#'     \item{model}{The model struct as a one-column list matrix, one row per
#'       field.}
#'     \item{reactions}{One-element list holding the reactions field.}
#'     \item{metabolites}{One-element list holding the metabolites field.}
#'     \item{genes}{One-element list holding the genes field.}
#'   }
#'   Positional access (`result[[2]]`, ...) keeps working.
parse_data_from_mat_file <- function(destination_file) {
  mat_contents <- readMat(destination_file)
  if (length(mat_contents) == 0L) {
    stop("No variables found in ", destination_file, call. = FALSE)
  }

  # The first variable in the file is the model struct (a list array).
  model_struct <- mat_contents[[1]]

  # Remember the field names before matrix() drops the dimnames.
  field_names <- dimnames(model_struct)[[1]]

  model <- matrix(
    model_struct,
    ncol = ncol(model_struct),
    nrow = nrow(model_struct)
  )

  # Select one field as a one-element list: by name when the name is known,
  # otherwise by its fallback position.
  pick_field <- function(field, fallback_position) {
    position <- match(field, field_names)
    if (is.na(position)) {
      position <- fallback_position
    }
    model[position]
  }

  # lb = data_from_mat[6]
  # lb = lb[[1]]
  # d = dim(s_matrix)[2]
  # d = length(lb)
  # A = rbind(diag(d), -diag(d))

  list(
    model = model,
    reactions = pick_field("rxns", 8L),
    metabolites = pick_field("mets", 1L),
    genes = pick_field("genes", 5L)
  )
}

# Parse the downloaded .mat model into its component lists.
new_parse <- parse_data_from_mat_file(destination_file_mat)

and what we have is this:

An enzyme from those that catalyze the reactions:

# First entry of the reactions field (element 2 of the parsed result).
new_parse[[2]][[1]][[1]]
## [[1]]
##      [,1] 
## [1,] "PFK"

The metabolites that take part:

# First entry of the metabolites field (element 3 of the parsed result).
new_parse[[3]][[1]][[1]]
## [[1]]
##      [,1]      
## [1,] "glc__D_e"

And the genes from which these molecules come:

# First entry of the genes field (element 4 of the parsed result).
new_parse[[4]][[1]][[1]]
## [[1]]
##      [,1]   
## [1,] "b1241"

Discussion

It seems that the .mat format is easier to handle. However, as this project will be implemented to a great extent in Python, we need to handle the .json format as well. That is because .json format is commonly used when working with Python.