Higgs Boson Machine Learning

Here is simple network to identify Higgs signal using Neural network.

In [1]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import random
import seaborn as sns
In [2]:
Filename = "training.csv"

n = sum(1 for line in open(Filename)) - 1 
s = 200000 #desired sample size
skip = sorted(random.sample(range(1,n+1),n-s)) 
df = pd.read_csv(Filename, skiprows=skip)

df = df.replace({'s': 1, 'b': 0})
In [3]:
print(df.head())
   EventId  DER_mass_MMC  DER_mass_transverse_met_lep  DER_mass_vis  DER_pt_h  \
0   100000       138.470                       51.655        97.827    27.980   
1   100002      -999.000                      162.172       125.953    35.635   
2   100004       175.864                       16.915       134.805    16.405   
3   100005        89.744                       13.550        59.149   116.344   
4   100006       148.754                       28.862       107.782   106.130   

   DER_deltaeta_jet_jet  DER_mass_jet_jet  DER_prodeta_jet_jet  \
0                 0.910           124.711                2.666   
1              -999.000          -999.000             -999.000   
2              -999.000          -999.000             -999.000   
3                 2.636           284.584               -0.540   
4                 0.733           158.359                0.113   

   DER_deltar_tau_lep  DER_pt_tot  ...    PRI_jet_num  PRI_jet_leading_pt  \
0               3.064      41.928  ...              2              67.435   
1               3.148       9.336  ...              1              44.251   
2               3.891      16.405  ...              0            -999.000   
3               1.362      61.619  ...              3              90.547   
4               2.941       2.545  ...              2             123.010   

   PRI_jet_leading_eta  PRI_jet_leading_phi  PRI_jet_subleading_pt  \
0                2.150                0.444                 46.062   
1                2.053               -2.028               -999.000   
2             -999.000             -999.000               -999.000   
3               -2.412               -0.653                 56.165   
4                0.864                1.450                 56.867   

   PRI_jet_subleading_eta  PRI_jet_subleading_phi  PRI_jet_all_pt    Weight  \
0                   1.240                  -2.475         113.497  0.002653   
1                -999.000                -999.000          44.251  2.347389   
2                -999.000                -999.000           0.000  6.245333   
3                   0.224                   3.106         193.660  0.083414   
4                   0.131                  -2.767         179.877  0.002653   

   Label  
0      1  
1      0  
2      0  
3      0  
4      1  

[5 rows x 33 columns]
In [4]:
corr_df = df.corr(method='pearson')
mask = np.zeros_like(corr_df)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(10,10))
# Create the heatmap using seaborn library. 

sns.heatmap(corr_df, cmap='RdYlGn_r', vmax=1.0, vmin=-1.0 , mask = mask, linewidths=2.5)
 
# The plot is reoriented the labels for each column and row to make them easier to read.
plt.yticks(rotation=0) 
plt.xticks(rotation=90) 
plt.show()
In [5]:
columns = ['EventId','PRI_tau_phi', 'PRI_lep_phi', 'PRI_met_phi', 'PRI_tau_eta', 'PRI_lep_eta', 'Weight']
df = df.drop(columns, axis=1 )

corr_df = df.corr(method='pearson')
mask = np.zeros_like(corr_df)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(10,10))

sns.heatmap(corr_df, cmap='RdYlGn_r', vmax=1.0, vmin=-1.0 , mask = mask, linewidths=2.5)

plt.yticks(rotation=0) 
plt.xticks(rotation=90) 
plt.show()
In [6]:
Filename = "training.csv"
n = sum(1 for line in open(Filename)) - 1
s = 10000 
skip = sorted(random.sample(range(1,n+1),n-s)) 
df_test= pd.read_csv(Filename, skiprows=skip)
df_test = df_test.replace({'s': 1, 'b': 0})
df_test = df_test.drop(columns, axis=1 )
In [7]:
X = np.array(df.drop(['Label'], 1))
y = np.array(df['Label'])
y = np.array(df['Label']).reshape(len(df.index),1)

X_train = X
encoder = OneHotEncoder(sparse=False)
y_onehot = encoder.fit_transform(y)
y_train = y_onehot

print ((X_train.shape))


X_test = np.array(df_test.drop(['Label'], 1))
y_test = np.array(df_test['Label'])
y_test = np.array(df_test['Label']).reshape(len(df_test.index),1)

encoder = OneHotEncoder(sparse=False)
y_onehot = encoder.fit_transform(y_test)
y_test = y_onehot

print ((X_test.shape, y_test.shape))
(200000, 25)
((10000, 25), (10000, 2))
In [8]:
n_nodes_hl1 = 400
n_nodes_hl2 = 400
n_nodes_hl3 = 400

n_classes = 2
batch_size = 1000

n_batches = -(-X_train.shape[0] // batch_size)

print('n_batches =', n_batches)

n_feature = X_test.shape[1]
X = tf.placeholder('float', [None, n_feature])
y = tf.placeholder('float')
print(n_feature)
n_batches = 200
25

Simple 3 layered network trained with AdamOptimizer, no dropout

In [9]:
def neural_network_model(data):
    
    hidden_1_layer = {'weights': tf.Variable(tf.random_normal([n_feature,n_nodes_hl1])),
                     'biases': tf.Variable(tf.random_normal([n_nodes_hl1]))}
    
    hidden_2_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl1,n_nodes_hl2])),
                     'biases': tf.Variable(tf.random_normal([n_nodes_hl2]))}
    
    hidden_3_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl2,n_nodes_hl3])),
                     'biases': tf.Variable(tf.random_normal([n_nodes_hl3]))}
    
    output_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl3,n_classes])),
                     'biases': tf.Variable(tf.random_normal([n_classes]))}
    
    l1 = tf.add(tf.matmul(X, hidden_1_layer['weights']) , hidden_1_layer['biases'])
    l1 = tf.nn.relu(l1)
            
    l2 = tf.add(tf.matmul(l1, hidden_2_layer['weights']) , hidden_2_layer['biases'])
    l2 = tf.nn.relu(l2)
        
    l3 = tf.add(tf.matmul(l2, hidden_3_layer['weights']) , hidden_3_layer['biases'])
    l3 = tf.nn.relu(l3)
    
    
    output = tf.matmul(l3, output_layer['weights']) + output_layer['biases']
    
    return(output)

Simple 3 layered network trained with AdamOptimizer, with dropout

In [10]:
def neural_network_model_dropout(data):
    
    hidden_1_layer = {'weights': tf.Variable(tf.random_normal([n_feature,n_nodes_hl1])),
                     'biases': tf.Variable(tf.random_normal([n_nodes_hl1]))}
    
    hidden_2_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl1,n_nodes_hl2])),
                     'biases': tf.Variable(tf.random_normal([n_nodes_hl2]))}
    
    hidden_3_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl2,n_nodes_hl3])),
                     'biases': tf.Variable(tf.random_normal([n_nodes_hl3]))}
    
    output_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl3,n_classes])),
                     'biases': tf.Variable(tf.random_normal([n_classes]))}
    
    l1 = tf.add(tf.matmul(X, hidden_1_layer['weights']) , hidden_1_layer['biases'])
    l1 = tf.nn.relu(l1)
    l1 = tf.nn.dropout(l1, 0.8)
        
    l2 = tf.add(tf.matmul(l1, hidden_2_layer['weights']) , hidden_2_layer['biases'])
    l2 = tf.nn.relu(l2)
    l2 = tf.nn.dropout(l2, 0.8)
    
    l3 = tf.add(tf.matmul(l2, hidden_3_layer['weights']) , hidden_3_layer['biases'])
    l3 = tf.nn.relu(l3)
    
    
    output = tf.matmul(l3, output_layer['weights']) + output_layer['biases']
    
    return(output)
In [11]:
def train_neural_network(x):
    prediction = neural_network_model(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    hm_epochs =501
    
    with tf.Session() as sess: 
        sess.run(tf.initialize_all_variables())
        
        for epoch in range(hm_epochs):
            epoch_loss=0
            _, c  = sess.run([optimizer, cost], feed_dict={x:X_train, y:y_train})
            epoch_loss += c
            if epoch%10 == 0:
                print('Epoch', epoch, 'Out of', hm_epochs, 'loss:', epoch_loss )
        correct = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
        accuracy = tf.reduce_mean(tf.cast(correct,'float'))
        print('Accuracy:', accuracy.eval({x:X_test, y:y_test}))
        
In [12]:
def train_neural_network_bd(x):
    prediction = neural_network_model_dropout(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    hm_epochs =501
    
    with tf.Session() as sess: 
        sess.run(tf.initialize_all_variables())
        
        for epoch in range(hm_epochs):
            epoch_loss=0
            for i in range(n_batches):
                _, c  = sess.run([optimizer, cost], feed_dict={x:X_train[i*batch_size : ((i+1)*batch_size)],
                                                                   y:y_train[i*batch_size : ((i+1)*batch_size)]})
                epoch_loss += c
            if epoch%10 == 0:
                print('Epoch', epoch, 'Out of', hm_epochs, 'loss:', epoch_loss )
        
        
        correct = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
        accuracy = tf.reduce_mean(tf.cast(correct,'float'))
        print('Accuracy:', accuracy.eval({x:X_test, y:y_test}))
In [13]:
train_neural_network(X)
WARNING:tensorflow:From /home/ninad/packages/anacoda3/lib/python3.6/site-packages/tensorflow/python/util/tf_should_use.py:175: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Epoch 0 Out of 501 loss: 4248129.5
Epoch 10 Out of 501 loss: 588927.25
Epoch 20 Out of 501 loss: 665489.1875
Epoch 30 Out of 501 loss: 310368.1875
Epoch 40 Out of 501 loss: 248956.359375
Epoch 50 Out of 501 loss: 131110.734375
Epoch 60 Out of 501 loss: 66554.640625
Epoch 70 Out of 501 loss: 45131.5820312
Epoch 80 Out of 501 loss: 33880.3125
Epoch 90 Out of 501 loss: 44801.5859375
Epoch 100 Out of 501 loss: 46364.4726562
Epoch 110 Out of 501 loss: 64384.5546875
Epoch 120 Out of 501 loss: 70244.90625
Epoch 130 Out of 501 loss: 53593.9726562
Epoch 140 Out of 501 loss: 69703.84375
Epoch 150 Out of 501 loss: 35292.7851562
Epoch 160 Out of 501 loss: 57737.6367188
Epoch 170 Out of 501 loss: 51565.6953125
Epoch 180 Out of 501 loss: 34899.71875
Epoch 190 Out of 501 loss: 23466.609375
Epoch 200 Out of 501 loss: 20929.8691406
Epoch 210 Out of 501 loss: 55451.4375
Epoch 220 Out of 501 loss: 27458.6582031
Epoch 230 Out of 501 loss: 32243.96875
Epoch 240 Out of 501 loss: 42088.0039062
Epoch 250 Out of 501 loss: 32162.7363281
Epoch 260 Out of 501 loss: 21200.9453125
Epoch 270 Out of 501 loss: 25628.1445312
Epoch 280 Out of 501 loss: 36435.953125
Epoch 290 Out of 501 loss: 20904.4765625
Epoch 300 Out of 501 loss: 28408.4433594
Epoch 310 Out of 501 loss: 97652.828125
Epoch 320 Out of 501 loss: 48561.0507812
Epoch 330 Out of 501 loss: 42596.4140625
Epoch 340 Out of 501 loss: 32428.6464844
Epoch 350 Out of 501 loss: 39913.8164062
Epoch 360 Out of 501 loss: 36111.375
Epoch 370 Out of 501 loss: 24992.0898438
Epoch 380 Out of 501 loss: 19410.9003906
Epoch 390 Out of 501 loss: 18877.3203125
Epoch 400 Out of 501 loss: 63965.625
Epoch 410 Out of 501 loss: 68210.84375
Epoch 420 Out of 501 loss: 60282.4257812
Epoch 430 Out of 501 loss: 27771.6425781
Epoch 440 Out of 501 loss: 59964.6523438
Epoch 450 Out of 501 loss: 18850.5117188
Epoch 460 Out of 501 loss: 30438.3222656
Epoch 470 Out of 501 loss: 16386.4296875
Epoch 480 Out of 501 loss: 14978.4804688
Epoch 490 Out of 501 loss: 20133.5175781
Epoch 500 Out of 501 loss: 65219.6367188
Accuracy: 0.654
In [14]:
train_neural_network_bd(X)
WARNING:tensorflow:From /home/ninad/packages/anacoda3/lib/python3.6/site-packages/tensorflow/python/util/tf_should_use.py:175: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Epoch 0 Out of 501 loss: 217078652.094
Epoch 10 Out of 501 loss: 7115170.89453
Epoch 20 Out of 501 loss: 2157362.43701
Epoch 30 Out of 501 loss: 510095.859131
Epoch 40 Out of 501 loss: 173491.126068
Epoch 50 Out of 501 loss: 81760.0101013
Epoch 60 Out of 501 loss: 27440.1593056
Epoch 70 Out of 501 loss: 17744.4905608
Epoch 80 Out of 501 loss: 15279.0385512
Epoch 90 Out of 501 loss: 9634.80531156
Epoch 100 Out of 501 loss: 3735.09218502
Epoch 110 Out of 501 loss: 585.104119599
Epoch 120 Out of 501 loss: 229.426472962
Epoch 130 Out of 501 loss: 155.39653331
Epoch 140 Out of 501 loss: 138.206129372
Epoch 150 Out of 501 loss: 211.281238377
Epoch 160 Out of 501 loss: 149.054277241
Epoch 170 Out of 501 loss: 139.443556726
Epoch 180 Out of 501 loss: 128.314543724
Epoch 190 Out of 501 loss: 198.53320998
Epoch 200 Out of 501 loss: 133.744871438
Epoch 210 Out of 501 loss: 161.162532628
Epoch 220 Out of 501 loss: 128.514085591
Epoch 230 Out of 501 loss: 126.718143165
Epoch 240 Out of 501 loss: 126.734203398
Epoch 250 Out of 501 loss: 126.861549616
Epoch 260 Out of 501 loss: 126.773496449
Epoch 270 Out of 501 loss: 126.716407478
Epoch 280 Out of 501 loss: 126.688420892
Epoch 290 Out of 501 loss: 138.359019518
Epoch 300 Out of 501 loss: 130.69383508
Epoch 310 Out of 501 loss: 139.704911709
Epoch 320 Out of 501 loss: 179.578865826
Epoch 330 Out of 501 loss: 195.577742159
Epoch 340 Out of 501 loss: 368.088497519
Epoch 350 Out of 501 loss: 126.67974025
Epoch 360 Out of 501 loss: 126.867865264
Epoch 370 Out of 501 loss: 126.734449625
Epoch 380 Out of 501 loss: 203.432457328
Epoch 390 Out of 501 loss: 126.707312822
Epoch 400 Out of 501 loss: 128.315523684
Epoch 410 Out of 501 loss: 126.736176133
Epoch 420 Out of 501 loss: 126.74873054
Epoch 430 Out of 501 loss: 126.747906089
Epoch 440 Out of 501 loss: 133.778655112
Epoch 450 Out of 501 loss: 126.711929977
Epoch 460 Out of 501 loss: 142.796718895
Epoch 470 Out of 501 loss: 136.198312879
Epoch 480 Out of 501 loss: 126.671968162
Epoch 490 Out of 501 loss: 127.611753583
Epoch 500 Out of 501 loss: 137.111907125
Accuracy: 0.6547
In [ ]: