Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
49 changes: 45 additions & 4 deletions CSpred.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,15 @@ def data_preprocessing(data):
Add_res_spec_feats(data, include_onehot=False)
data = feat_pwr(data, hbondd_cols + cos_cols, [2])
data = feat_pwr(data, hbondd_cols, [-1,-2,-3])
dropped_cols = dssp_pp_cols + dssp_energy_cols + ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'FILE_ID', 'PDB_FILE_NAME', 'RESNAME', 'RES_NUM',"RES", 'CHAIN', 'RESNAME_ip1', 'RESNAME_im1', 'BMRB_RES_NUM', 'CG', 'RCI_S2', 'MATCHED_BMRB',"identifier"]+ rcoil_cols
dropped_cols = dssp_pp_cols + dssp_energy_cols + ['FILE_ID_x','FILE_ID_y','Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'FILE_ID', 'PDB_FILE_NAME', 'RESNAME', 'RES_NUM',"RES", 'CHAIN', 'RESNAME_ip1', 'RESNAME_im1', 'BMRB_RES_NUM', 'RCI_S2', 'MATCHED_BMRB',"identifier"]+ rcoil_cols
data = data.drop(set(dropped_cols) & set(data.columns), axis=1)
return data



def prepare_data_for_atom(data,atom):
'''
Function to generate features data for a given atom type: meaning that the irrelevant ring current values are removed from features
Function to generate feature data for a given atom type, meaning that the values irrelevant to that atom are removed from the dataset

args:
data - the dataset that contains all the features (pandas.DataFrame)
Expand All @@ -68,14 +70,41 @@ def prepare_data_for_atom(data,atom):
returns:
pandas.DataFrame containing the cleaned feature set
'''

dat = data.copy()

column_names = dat.columns.tolist()
new_column_names = [name.replace('.1', '') if name.endswith('.1') else name for name in column_names]
dat.columns = new_column_names


ring_col = atom + '_RC'
rem1 = ring_cols.copy()
rem1.remove(ring_col)
dat = dat.drop(rem1, axis=1)
rem2 = [rm_atom + "_RING" for rm_atom in ['C', 'CA', 'CB', 'N', 'HA', 'HA2', 'HA3', 'H', '1H', '1HA', '2HA','CG','CD', 'CD1', 'CD2', 'CE', 'CE1', 'CE2', 'CG1', 'CG2', 'CZ','HB', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HE3', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ','CE3','CZ3','HZ3','CH2','HH2','CZ2','HZ2', 'HB1', 'HD11', 'HD12', 'HD13', 'HD23', 'HG11', 'HZ1', 'HG21', 'HG22', 'HG23','ND2','NE1','NE2']]
rem3 = [rm_atom + "_EFIELD" for rm_atom in ['HA2', 'HA3', 'HA', 'H', 'HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG11', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ','HH11','HH12', 'HD11', 'HD12', 'HD13', 'HH21','HH22', 'HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23', 'HZ1', 'HZ2', 'HZ3', 'ND2','NE1','NE2','N'] if rm_atom != atom]
rem4 = [rm_atom + "_dHA" for rm_atom in ['HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ', 'HD11', 'HD12', 'HD13', 'HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23', 'HG'] if rm_atom != atom]
rem5 = [rm_atom + "_COS_H" for rm_atom in ['HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ', 'HD11', 'HD12', 'HD13', 'HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23', 'HG'] if rm_atom != atom]
rem6 = [rm_atom + "_COS_A" for rm_atom in ['HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ','HD11', 'HD12', 'HD13', 'HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23', 'HG'] if rm_atom != atom]
rem7 = [rm_atom + "_EXISTS" for rm_atom in ['HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ','HD11', 'HD12', 'HD13','HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23', 'HG'] if rm_atom != atom]
rem8 = [rm_atom + "_ENERGY" for rm_atom in ['HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ', 'HD11', 'HD12', 'HD13', 'HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23', 'HG'] if rm_atom != atom]
dat = dat.drop(rem1 + rem2 + rem3 + rem4 + rem5 + rem6 + rem7 + rem8, axis=1, errors='ignore')


hbondd_sidechain_cols = [i+j for i in ['HB', 'HB1', 'HB2', 'HB3', 'HD1', 'HD2', 'HD21', 'HD22', 'HD3', 'HE', 'HE1', 'HE2', 'HE21', 'HE22', 'HG', 'HG1', 'HG12', 'HG13', 'HG2', 'HG3', 'HZ','HH11','HH12', 'HD11', 'HD12', 'HD13', 'HH21','HH22', 'HD23', 'HE3','HZ3','HH2','HZ2', 'HZ1', 'HG21', 'HG22', 'HG23'] for j in ['_dHA', '_COS_H', '_COS_A']]
hbondd_sidechain_cols = [element for element in hbondd_sidechain_cols if element.startswith(atom + '_')]
#add polynomial transformation of side chain hbonds
dat = dat.loc[:, ~dat.columns.duplicated()]
dat=feat_pwr(dat,hbondd_sidechain_cols,[-1,-2,-3])
dat=feat_pwr(dat,hbondd_sidechain_cols,[2])

dat[ring_col] = dat[ring_col].fillna(value=0)


return dat



def calc_sing_pdb(pdb_file_name,pH=5,TP=True,TP_pred=None,ML=True,test=False):
'''
Function for calculating chemical shifts for a single PDB file using X module / Y module / both
Expand Down Expand Up @@ -111,23 +140,32 @@ def calc_sing_pdb(pdb_file_name,pH=5,TP=True,TP_pred=None,ML=True,test=False):
if ML:
print("Generating features ...")
feats = build_input(pdb_file_name, pH)

feats.rename(index=str, columns=sparta_rename_map, inplace=True) # Rename columns so that random coil columns can be correctly recognized

resnames = feats["RESNAME"]
resnums = feats["RES_NUM"]
rcoils = feats[rcoil_cols]
feats = data_preprocessing(feats)

result = {"RESNUM":resnums, "RESNAME":resnames}
for atom in toolbox.ATOMS:

print("Calculating UCBShift-X predictions for %s ..." % atom)
# Predictions for each atom

# Predictions for each atom
atom_feats = prepare_data_for_atom(feats, atom)

r0 = joblib.load(ML_MODEL_PATH + "%s_R0.sav" % atom)

atom_feats.fillna(0, inplace=True)
r0_pred = r0.predict(atom_feats.values)

feats_r1 = atom_feats.copy()
feats_r1["R0_PRED"] = r0_pred
r1 = joblib.load(ML_MODEL_PATH + "%s_R1.sav" % atom)


r1_pred = r1.predict(feats_r1.values)
# Write ML predictions
result[atom+"_X"] = r1_pred + rcoils["RCOIL_"+atom]
Expand All @@ -152,6 +190,8 @@ def calc_sing_pdb(pdb_file_name,pH=5,TP=True,TP_pred=None,ML=True,test=False):
r2_pred = r1_pred.copy()
if len(valid_feats_r2):
r2 = joblib.load(ML_MODEL_PATH + "%s_R2.sav" % atom)
valid_feats_r2 = valid_feats_r2.fillna(0)

r2_pred_valid = r2.predict(valid_feats_r2.values)
r2_pred[valid] = r2_pred_valid
# Write final predictions
Expand Down Expand Up @@ -208,3 +248,4 @@ def calc_sing_pdb(pdb_file_name,pH=5,TP=True,TP_pred=None,ML=True,test=False):

print("Complete!")


15 changes: 10 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# UCBShift

UCBShift is a program for predicting chemical shifts for backbone atoms and β-carbon of a protein in solution. The program implements two mechanisms: a transfer prediction module that employs both sequence alignment and structure alignment to select references for shift replication; and an ensemble decision tree based machine learning module which takes features extracted from a PDB file and makes trustful chemical shift predictions. When combined together, this new predictor achieves state-of-the-art accuracy for predicting chemical shifts in a "real-world" dataset, with root-mean-square errors of 0.38, 0.22, 1.31, 0.97, 1.29 and 2.16 ppm between prediction and experimental values for H, Hα, C, Cα, Cβ and N.
UCBShift 2.0 is a program for predicting chemical shifts for backbone and side chain atoms of a protein in solution. The program implements two mechanisms: a transfer prediction module that employs both sequence alignment and structure alignment to select references for shift replication; and an ensemble decision-tree-based machine learning module that takes features extracted from a PDB file and makes trustworthy chemical shift predictions. When the two modules are combined, the predictor achieves state-of-the-art accuracy for predicting chemical shifts in a "real-world" dataset, with root-mean-square errors of 0.38, 0.22, 1.31, 0.97, 1.29 and 2.16 ppm between predicted and experimental values for H, Hα, C, Cα, Cβ and N, respectively.

## Publication
## Publications
Li, J., Bennett, K. C., Liu, Y., Martin, M. V., & Head-Gordon, T. (2020). Accurate prediction of chemical shifts for aqueous protein structure on “Real World” data. _Chemical Science_, 11(12), 3180-3191. DOI: [10.1039/C9SC06561J](https://pubs.rsc.org/en/content/articlehtml/2020/sc/c9sc06561j)

Ptaszek, A. L., Li, J., Konrat, R., Platzer, G., & Head-Gordon, T. (2024). UCBShift 2.0: Bridging the gap from backbone to side chain protein chemical shift prediction for protein structures. _Journal of the American Chemical Society_, 146(46), 31733-31745. DOI: [10.1021/jacs.4c10474](https://pubs.acs.org/doi/10.1021/jacs.4c10474)

## Using UCBShift through NMRBox
We recommend users run UCBShift through NMRBox, which provides out-of-box using experience for UCBShift in their virtual machines. You can sign up for NMRBox here: https://nmrbox.nmrhub.org/
## Using UCBShift 1.0 through NMRBox
We recommend that users run UCBShift 1.0 through NMRBox, which provides an out-of-the-box experience for UCBShift 1.0 on its virtual machines. You can sign up for NMRBox here: https://nmrbox.nmrhub.org/

## Software package requirements
### Python and python packages
Expand All @@ -22,6 +23,7 @@ We recommend users run UCBShift through NMRBox, which provides out-of-box using
* blast (2.9.0, https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download)
* mTM-align (20180725, http://yanglab.nankai.edu.cn/mTM-align/)
* DSSP (2.04, https://swift.cmbi.umcn.nl/gv/dssp/)
* reduce (3.23, https://github.com/rlabduke/reduce)

### Installation notes
We suggest creating a new virtual environment (e.g. with Anaconda) for the code. You can install the required packages using the provided `requirements.txt` file:
Expand All @@ -31,7 +33,7 @@ pip install -r requirements.txt
However, you still need to download and install the external programs manually.

## Usage
Because the trained models are big, users are directed to [here](https://datadryad.org/stash/share/6vbrswTtNRcHk2vV3e6P1QGH1yYMhvdHDlauysTCObE) to download all the saved model files. After downloading the models.tgz file, extract them into the `models/` folder using the command `tar -xzf models.tgz` (so that there will be 18 .sav files under `models/` folder)<br>
Because the trained models are large, users are directed to [this archive](https://doi.org/10.5281/zenodo.15375968) to download all the saved model files. After downloading the models.zip file, extract it into the `models/` folder (so that there are 141 .sav files under the `models/` folder).<br>
Users can use the trained model "as is" once they have correctly configured the python packages and external programs.
The [`CSpred.py`](https://github.com/JerryJohnsonLee/CSpred/blob/master/CSpred.py) file is the entry point to the UCBShift chemical shift predictor. <br>
The easiest, out-of-the-box way of using UCBShift is to run the `CSpred.py` script directly on your protein of interest. A `shifts.csv` file will be generated in the directory where you executed the script. The syntax looks something like this:
Expand Down Expand Up @@ -125,6 +127,8 @@ A `shifts.csv` file will be generated by default under the folder where you run
## Reproducibility
You can reproduce the results by preparing all the data and retraining the model on your own machine. Follow [`PROCEDURE.md`](https://github.com/JerryJohnsonLee/CSpred/blob/master/train_model/PROCEDURE.md) under the folder `train_model/` for a complete description of how to train the model.

## FAQs
**Q:** I have run into the following issue:
```
Expand All @@ -145,3 +149,4 @@ Copyright ©20xx The Regents of the University of California (Regents). All Rig
IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.

Loading