COMBAI computational biology and artificial intelligence

Cancer classification system

(beta version)

This Cancer classification system was generated from our recent study "Noncoding RNAs and deep learning neural network discriminate multi-cancer types"
by Anyou Wang, Rong Hai, Paul J Rider and Qianchuan He

Detecting cancers at early stages can dramatically reduce mortality rates. Therefore, practical cancer screening at the population level is needed. Here, we develop a comprehensive detection system to classify all common cancer types. By integrating an artificial intelligence deep learning neural network with noncoding RNA biomarkers selected from massive data, our system can accurately detect cancer vs healthy samples with an AUC of 96.3% (AUC: Area Under the Curve of a Receiver Operating Characteristic, or ROC, curve). Intriguingly, with no more than 6 biomarkers, our approach can easily discriminate any individual cancer type vs normal with 99% to 100% AUC. Furthermore, a comprehensive marker panel can simultaneously multi-classify all common cancers with a stable accuracy of 82% across heterogeneous cancerous tissues and conditions. This provides a valuable framework for large-scale cancer screening.

binary classifier

  def myModel(train_x):
      """Build and compile the binary-classification neural network.

      The network is Dense(30, relu) -> Dropout(0.1) -> Dense(60, relu)
      -> Dropout(0.1) -> Dense(1, sigmoid), compiled with the Adam
      optimizer, binary cross-entropy loss and the accuracy metric.

      Args:
          train_x: training features; must expose ``.columns`` (e.g. a
              pandas DataFrame). The number of columns sets the input width.

      Returns:
          The compiled, not yet fitted, Sequential model.
      """
      mymodel = Sequential([
          Dense(30, input_dim=len(train_x.columns), activation='relu'),
          Dropout(0.1),
          Dense(60, activation='relu'),
          Dropout(0.1),
          Dense(1, activation='sigmoid')
      ])
      mymodel.compile(optimizer='adam', loss='binary_crossentropy',
                      metrics=['accuracy'])
      return mymodel
          

    Typical run

          
          
            # Build a network sized to the training features, then fit it for
            # 30 epochs in batches of 20, passing (val_x, val_y) as validation data.
            nnt = myModel(train_x)
            history = nnt.fit(
                x=train_x,
                y=train_y,
                validation_data=(val_x, val_y),
                batch_size=20,
                epochs=30,
                shuffle=True,
                verbose=0,
            )
         
          
                 

    An example run comparing three mathematical models (neural network, random forest and decision tree)

  •        
    
    import os,pickle
    
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Activation, Dense,Dropout
    from tensorflow.keras import layers
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.metrics import categorical_crossentropy,binary_crossentropy
    from sklearn.utils import shuffle
    
    import pandas as pd
    import numpy as np
    from scipy import interp
    import matplotlib.pyplot as plt
    from itertools import cycle
    from sklearn.metrics import roc_curve, auc
    from sklearn.model_selection import train_test_split
    from sklearn import preprocessing
    
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn import tree
    
    ###NN model
    def myModel(train_x):
        """Return a compiled feed-forward network for binary classification.

        Args:
            train_x: training features exposing ``.columns`` (e.g. a pandas
                DataFrame); the column count becomes the input dimension.

        Returns:
            A Sequential model (30 and 60 relu units with 0.1 dropout after
            each, then one sigmoid unit) compiled with Adam, binary
            cross-entropy loss and the accuracy metric.
        """
        n_features = len(train_x.columns)
        layer_stack = [
            Dense(30, input_dim=n_features, activation='relu'),
            Dropout(0.1),
            Dense(60, activation='relu'),
            Dropout(0.1),
            Dense(1, activation='sigmoid'),
        ]
        network = Sequential(layer_stack)
        network.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy'],
        )
        return network
    
    # ROC Plot
    def plotROC(i, fpr_nnt, tpr_nnt, auc_nnt, fpr_rfc, tpr_rfc, auc_rfc,
                fpr_td, tpr_td, auc_td):
        """Draw the three ROC curves on one figure and save it as PDF and PNG.

        Args:
            i: value shown in the title and embedded in the output file names.
            fpr_nnt, tpr_nnt, auc_nnt: ROC points and area for the 'NNT' curve.
            fpr_rfc, tpr_rfc, auc_rfc: ROC points and area for the 'RF' curve.
            fpr_td, tpr_td, auc_td: ROC points and area for the 'TD' curve.

        Returns:
            None. Writes 'Marker_number:<i>_ROC.pdf' and
            'Marker_number:<i>_ROC.png' to the working directory and closes
            the current figure.
        """
        tag = str(i)

        # Dashed diagonal as the chance-level reference line.
        plt.plot([0, 1], [0, 1], 'k--')

        curves = (
            ('NNT (area = {:.3f})', fpr_nnt, tpr_nnt, auc_nnt),
            ('RF (area = {:.3f})', fpr_rfc, tpr_rfc, auc_rfc),
            ('TD (area = {:.3f})', fpr_td, tpr_td, auc_td),
        )
        for label_template, fpr, tpr, area in curves:
            plt.plot(fpr, tpr, label=label_template.format(area))

        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title("Number: " + tag + '_ROC curve')
        plt.legend(loc='best')

        for extension in ('pdf', 'png'):
            plt.savefig("Marker_number:" + tag + '_ROC.' + extension)
        plt.close()
    
    
    ###model comparison
    def seriesRun(X1):
        """Compare three models on growing prefixes of the feature columns.

        For every i from 1 to the number of feature columns, the first i
        feature columns plus the label are split 80/20 into train/test, and
        the train part is split 80/20 again into train/validation. Three
        models are then fitted on the training part: the neural network from
        myModel, a random forest, and a depth-2 decision tree regressor. Each
        is scored by ROC AUC on the test part. The network's per-epoch
        accuracy history is written to a file and plotROC saves the ROC figure.

        Args:
            X1: pandas DataFrame with a label column named 'target'; every
                other column is treated as a feature, in its existing order.
                (Labels are presumably binary 0/1 - confirm against caller.)

        Returns:
            dict mapping str(i) to [auc_nnt, auc_rfc, auc_td].
        """
        # Separate the features once so the prefix slice can never contain
        # the label, wherever the 'target' column sits in X1.
        features = X1.drop(columns=['target'])
        auc_dic = {}
        for i in range(1, len(features.columns) + 1):
            # assign() returns a new frame, so the label is never written into
            # a slice of X1 (which triggers pandas' SettingWithCopyWarning).
            x1 = features.iloc[:, :i].assign(target=X1['target'])

            train, test = train_test_split(x1, test_size=0.2)
            train, val = train_test_split(train, test_size=0.2)
            train_y = train['target']
            train_x = train.drop(columns=['target'])
            val_y = val['target']
            val_x = val.drop(columns=['target'])
            test_y = test['target']
            test_x = test.drop(columns=['target'])

            # Neural network.
            nnt = myModel(train_x)
            history = nnt.fit(x=train_x, y=train_y,
                              validation_data=(val_x, val_y),
                              batch_size=20, epochs=30, shuffle=True, verbose=0)
            h = history.history['accuracy']
            v = history.history['val_accuracy']
            df = pd.DataFrame(list(zip(h, v)), columns=["acc", "val_acc"])
            # NOTE(review): the '.csc' extension looks like a typo for '.csv';
            # kept unchanged because downstream readers may rely on this name.
            df.to_csv("total_vs_c_" + "_" + str(i) + "_" + "ACCURACY_matrix.csc")
            pred_nnt = nnt.predict(test_x, batch_size=20, verbose=0).ravel()
            fpr_nnt, tpr_nnt, _ = roc_curve(test_y, pred_nnt)
            auc_nnt = auc(fpr_nnt, tpr_nnt)

            # Random forest, scored with the probability of the second class.
            rfc = RandomForestClassifier(max_depth=3, n_estimators=40)
            rfc.fit(train_x, train_y)
            pred_rfc = rfc.predict_proba(test_x)[:, 1]
            fpr_rfc, tpr_rfc, _ = roc_curve(test_y, pred_rfc)
            auc_rfc = auc(fpr_rfc, tpr_rfc)

            # Decision tree regressor; its predictions are used as ROC scores.
            td = tree.DecisionTreeRegressor(random_state=0, max_depth=2)
            td = td.fit(train_x, train_y)
            pred_td = td.predict(test_x)
            fpr_td, tpr_td, _ = roc_curve(test_y, pred_td)
            auc_td = auc(fpr_td, tpr_td)

            auc_dic[str(i)] = [auc_nnt, auc_rfc, auc_td]
            plotROC(i, fpr_nnt, tpr_nnt, auc_nnt, fpr_rfc, tpr_rfc, auc_rfc,
                    fpr_td, tpr_td, auc_td)
        return auc_dic
           
           

    To search biomarker database, please enter a gene ID or symbol based on GRCh38.p2.v22

    For example, ENSG00000213700.3 or RPL17P50

    References