THGLab · ptaszeg6 · Jul 28, 2024 · Jul 28, 2024 · Aug 8, 2024 · Oct 3, 2024
diff --git a/CSpred.py b/CSpred.py
@@ -53,13 +53,15 @@ def data_preprocessing(data):
     Add_res_spec_feats(data, include_onehot=False)
     data = feat_pwr(data, hbondd_cols + cos_cols, [2])
     data = feat_pwr(data, hbondd_cols, [-1,-2,-3])
-    dropped_cols = dssp_pp_cols + dssp_energy_cols + ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'FILE_ID', 'PDB_FILE_NAME', 'RESNAME', 'RES_NUM',"RES", 'CHAIN', 'RESNAME_ip1', 'RESNAME_im1', 'BMRB_RES_NUM', 'CG', 'RCI_S2', 'MATCHED_BMRB',"identifier"]+ rcoil_cols
+    dropped_cols = dssp_pp_cols + dssp_energy_cols + ['FILE_ID_x','FILE_ID_y','Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'FILE_ID', 'PDB_FILE_NAME', 'RESNAME', 'RES_NUM',"RES", 'CHAIN', 'RESNAME_ip1', 'RESNAME_im1', 'BMRB_RES_NUM', 'RCI_S2', 'MATCHED_BMRB',"identifier"]+ rcoil_cols
     data = data.drop(set(dropped_cols) & set(data.columns), axis=1)
     return data
 
+
+
 def prepare_data_for_atom(data,atom):
     '''
-    Function to generate features data for a given atom type: meaning that the irrelevant ring current values are removed from features
+    Function to generate feature data for a given atom type, meaning that the irrelevant values are removed from the dataset
 
     args:
         data - the dataset that contains all the features (pandas.DataFrame)
@@ -68,14 +70,41 @@ def prepare_data_for_atom(data,atom):
     returns:
         pandas.DataFrame containing the cleaned feature set
     '''
+
     dat = data.copy()
+
+    column_names = dat.columns.tolist()
+    new_column_names = [name.replace('.1', '') if name.endswith('.1') else name for name in column_names]
+    dat.columns = new_column_names
+
+
     ring_col = atom + '_RC'
     rem1 = ring_cols.copy()
     rem1.remove(ring_col)
-    dat = dat.drop(rem1, axis=1)
+    rem2 = [rm_atom + "_RING" for rm_atom in ['C', 'CA', 'CB', 'N', 'HA', 'HA2', 'HA3', 'H', '1H', '1HA', '2HA','CG','CD', 'CD1', 'CD2', 'CE', 'CE1', 'CE2', 'CG1', 'CG2', 'CZ','HB', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HE3', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ','CE3','CZ3','HZ3','CH2','HH2','CZ2','HZ2', 'HB1', 'HD11', 'HD12', 'HD13', 'HD23', 'HG11', 'HZ1', 'HG21', 'HG22', 'HG23','ND2','NE1','NE2']]
+    rem3 = [rm_atom + "_EFIELD" for rm_atom in ['HA2', 'HA3', 'HA', 'H', 'HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG11', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ','HH11','HH12', 'HD11', 'HD12', 'HD13', 'HH21','HH22', 'HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23', 'HZ1', 'HZ2', 'HZ3',  'ND2','NE1','NE2','N'] if rm_atom != atom]
+    rem4 = [rm_atom + "_dHA" for rm_atom in ['HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ', 'HD11', 'HD12', 'HD13',  'HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23', 'HG'] if rm_atom != atom]
+    rem5 = [rm_atom + "_COS_H" for rm_atom in ['HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ', 'HD11', 'HD12', 'HD13',  'HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23', 'HG'] if rm_atom != atom]
+    rem6 = [rm_atom + "_COS_A" for rm_atom in ['HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ','HD11', 'HD12', 'HD13', 'HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23', 'HG'] if rm_atom != atom]
+    rem7 = [rm_atom + "_EXISTS" for rm_atom in ['HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ','HD11', 'HD12', 'HD13','HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23', 'HG'] if rm_atom != atom]
+    rem8 = [rm_atom + "_ENERGY" for rm_atom in ['HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ', 'HD11', 'HD12', 'HD13',  'HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23', 'HG'] if rm_atom != atom]
+    dat = dat.drop(rem1 + rem2 + rem3 + rem4 + rem5 + rem6 + rem7 + rem8, axis=1, errors='ignore')
+
+
+    hbondd_sidechain_cols = [i+j for i in ['HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ','HH11','HH12', 'HD11', 'HD12', 'HD13', 'HH21','HH22', 'HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23']  for j in ['_dHA', '_COS_H', '_COS_A']]
+    hbondd_sidechain_cols = [element for element in hbondd_sidechain_cols if element.startswith(atom + '_')]
+    #add polynomial transformation of side chain hbonds
+    dat = dat.loc[:, ~dat.columns.duplicated()]
+    dat=feat_pwr(dat,hbondd_sidechain_cols,[-1,-2,-3])
+    dat=feat_pwr(dat,hbondd_sidechain_cols,[2])    
+
     dat[ring_col] = dat[ring_col].fillna(value=0)
+
+
     return dat
 
+
+
 def calc_sing_pdb(pdb_file_name,pH=5,TP=True,TP_pred=None,ML=True,test=False):
     '''
     Function for calculating chemical shifts for a single PDB file using X module / Y module / both
@@ -111,23 +140,32 @@ def calc_sing_pdb(pdb_file_name,pH=5,TP=True,TP_pred=None,ML=True,test=False):
     if ML:
         print("Generating features ...")
         feats = build_input(pdb_file_name, pH)
+
         feats.rename(index=str, columns=sparta_rename_map, inplace=True) # Rename columns so that random coil columns can be correctly recognized
+
         resnames = feats["RESNAME"]
         resnums = feats["RES_NUM"]
         rcoils = feats[rcoil_cols]
         feats = data_preprocessing(feats)
 
         result = {"RESNUM":resnums, "RESNAME":resnames}
         for atom in toolbox.ATOMS:
+
             print("Calculating UCBShift-X predictions for %s ..." % atom)
-            # Predictions for each atom
+
+            # Predictions for each atom
             atom_feats = prepare_data_for_atom(feats, atom)
+
             r0 = joblib.load(ML_MODEL_PATH + "%s_R0.sav" % atom)
+
+            atom_feats.fillna(0, inplace=True)
             r0_pred = r0.predict(atom_feats.values)
 
             feats_r1 = atom_feats.copy()
             feats_r1["R0_PRED"] = r0_pred
             r1 = joblib.load(ML_MODEL_PATH + "%s_R1.sav" % atom)
+
+
             r1_pred = r1.predict(feats_r1.values)
             # Write ML predictions
             result[atom+"_X"] = r1_pred + rcoils["RCOIL_"+atom]
@@ -152,6 +190,8 @@ def calc_sing_pdb(pdb_file_name,pH=5,TP=True,TP_pred=None,ML=True,test=False):
                 r2_pred = r1_pred.copy()
                 if len(valid_feats_r2):
                     r2 = joblib.load(ML_MODEL_PATH + "%s_R2.sav" % atom)
+                    valid_feats_r2 = valid_feats_r2.fillna(0)
+
                     r2_pred_valid = r2.predict(valid_feats_r2.values)
                     r2_pred[valid] = r2_pred_valid
                 # Write final predictions
@@ -208,3 +248,4 @@ def calc_sing_pdb(pdb_file_name,pH=5,TP=True,TP_pred=None,ML=True,test=False):
 
     print("Complete!")
 
+
diff --git a/README.md b/README.md
@@ -1,12 +1,13 @@
 # UCBShift
 
-UCBShift is a program for predicting chemical shifts for backbone atoms and β-carbon of a protein in solution. The program implements two mechanisms:  a transfer prediction module that employs both sequence alignment and structure alignment to select references for shift replication; and an ensemble decision tree based machine learning module which takes features extracted from a PDB file and makes trustful chemical shift predictions. When combined together, this new predictor achieves state-of-the-art accuracy for predicting chemical shifts in a "real-world" dataset, with root-mean-square errors of  0.38, 0.22, 1.31, 0.97, 1.29 and 2.16 ppm between prediction and experimental values for H, Hα, C, Cα, Cβ and N.
+UCBShift 2.0 is a program for predicting chemical shifts for backbone and side chain atoms of a protein in solution. The program implements two mechanisms: a transfer prediction module that employs both sequence alignment and structure alignment to select references for shift replication; and an ensemble decision-tree-based machine learning module which takes features extracted from a PDB file and makes reliable chemical shift predictions. When combined together, this new predictor achieves state-of-the-art accuracy for predicting chemical shifts in a "real-world" dataset, with root-mean-square errors of 0.38, 0.22, 1.31, 0.97, 1.29 and 2.16 ppm between prediction and experimental values for H, Hα, C, Cα, Cβ and N.
 
-## Publication
+## Publications
 Li, J., Bennett, K. C., Liu, Y., Martin, M. V., & Head-Gordon, T. (2020). Accurate prediction of chemical shifts for aqueous protein structure on “Real World” data. _Chemical Science_, 11(12), 3180-3191. DOI: [10.1039/C9SC06561J](https://pubs.rsc.org/en/content/articlehtml/2020/sc/c9sc06561j)
+Ptaszek, A. L., Li, J., Konrat, R., Platzer, G., & Head-Gordon, T. (2024). UCBShift 2.0: Bridging the gap from backbone to side chain protein chemical shift prediction for protein structures. _Journal of the American Chemical Society_, 146(46), 31733-31745. DOI: [10.1021/jacs.4c10474](https://pubs.acs.org/doi/10.1021/jacs.4c10474)
 
-## Using UCBShift through NMRBox
-We recommend users run UCBShift through NMRBox, which provides out-of-box using experience for UCBShift in their virtual machines. You can sign up for NMRBox here: https://nmrbox.nmrhub.org/
+## Using UCBShift 1.0 through NMRBox
+We recommend users run UCBShift 1.0 through NMRBox, which provides an out-of-the-box experience for UCBShift 1.0 in its virtual machines. You can sign up for NMRBox here: https://nmrbox.nmrhub.org/
 
 ## Software package requirements
 ### Python and python packages
@@ -22,6 +23,7 @@ We recommend users run UCBShift through NMRBox, which provides out-of-box using
 * blast (2.9.0, https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download)
 * mTM-align (20180725, http://yanglab.nankai.edu.cn/mTM-align/)
 * DSSP (2.04, https://swift.cmbi.umcn.nl/gv/dssp/)
+* reduce (3.23, https://github.com/rlabduke/reduce)
 
 ### Installation notes
 We suggest creating new virtual environments (i.e. anaconda) for the code. You can install the required packages using the provided `requirements.txt` file
@@ -31,7 +33,7 @@ pip install -r requirements.txt
 However you still need to download and install the external programs manually.
 
 ## Usage
-Because the trained models are big, users are directed to [here](https://datadryad.org/stash/share/6vbrswTtNRcHk2vV3e6P1QGH1yYMhvdHDlauysTCObE) to download all the saved model files. After downloading the models.tgz file, extract them into the `models/` folder using the command `tar -xzf models.tgz` (so that there will be 18 .sav files under `models/` folder)<br>
+Because the trained models are large, users are directed to [here](https://doi.org/10.5281/zenodo.15375968) to download all the saved model files. After downloading the models.zip file, extract it into the `models/` folder (so that there will be 141 .sav files under the `models/` folder)<br>
 Users can use the trained model "as is" once they have correctly configured the python packages and external programs.
 The [`CSpred.py`](https://github.com/JerryJohnsonLee/CSpred/blob/master/CSpred.py) file is the entrance to UCBShift chemical shift predictor. <br>
 The easiest, out-of-the-box way of using UCBShift is running CSpred.py script directly on your desired protein. A [shifts.csv] file will be generated at the same position where you executed the script. The syntax will be something like this:
@@ -125,6 +127,8 @@ A `shifts.csv` file will be generated by default under the folder where you run
 ## Reproducibility
   You can reproduce the results by preparing all the data and retrain the model on your own machine. Follow [`PROCEDURE.md`](https://github.com/JerryJohnsonLee/CSpred/blob/master/train_model/PROCEDURE.md) under the folder `train_model/` for a complete description of how to train the model.
 
+
+
 ## FAQs
 **Q:** I have run into the following issue: 
 ```
@@ -145,3 +149,4 @@ Copyright ©20xx  The Regents of the University of California (Regents). All Rig
 IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+