# Imports

In [1]:
import pandas as pd
import os
import io
from utils import read_file
from utils import convert_to_unsigned_np
from PIL import Image
import numpy as np
from scipy import signal 
import matplotlib as mpl 
import matplotlib.pyplot as plt 
import requests  # pip install requests 
import sys
import numpy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RF
import scipy
import torch
from utils import Conv
from utils import make_conv_feat
from sklearn.linear_model import LogisticRegression as LR



In [2]:
# scipy is already imported in the imports cell; this cell only displays the
# installed version so the environment is recorded alongside the results.
import scipy

scipy.__version__

'1.12.0'

# Configuration constants

In [3]:
# Directory that the label CSVs and the *.list files are joined onto below.
DIR = "/data/isip/www/isip/courses/temple/ece_8527/resources/data/set_15/lists"
# NOTE(review): N appears unused by any active statement in this notebook.
N = 20000

# Data Loading

### Labels

In [4]:
# Load the four label tables (train/dev x healthy/unhealthy) from DIR.
# Each file name is joined onto DIR and read directly, so no scratch
# "*_dir" / "*_path" variables (which actually held file names) are reused
# and overwritten between reads.
df_train_healthy = pd.read_csv(os.path.join(DIR, "data_train_healthy.csv"))
df_train_unhealthy = pd.read_csv(os.path.join(DIR, "data_train_unhealthy.csv"))
df_dev_healthy = pd.read_csv(os.path.join(DIR, "data_dev_healthy.csv"))
df_dev_unhealthy = pd.read_csv(os.path.join(DIR, "data_dev_unhealthy.csv"))





In [5]:
# Column-wise mean of the unhealthy training label table (one value per label column).
df_train_unhealthy.mean()

1dAVb    0.178496
RBBB     0.355484
LBBB     0.178878
SB       0.183853
AF       0.209394
ST       0.178496
dtype: float64

### Features

In [28]:
def _read_list(list_name):
    """Read a .list file under DIR and return its lines as a list of strings.

    The file content comes from the project helper ``read_file``; surrounding
    whitespace is stripped before splitting on newlines so that a trailing
    newline does not produce an empty last entry.
    """
    return read_file(os.path.join(DIR, list_name)).strip().split("\n")


train_unhealthy_list = _read_list("data_train_unhealthy.list")
train_healthy_list = _read_list("data_train_healthy.list")
dev_unhealthy_list = _read_list("data_dev_unhealthy.list")
dev_healthy_list = _read_list("data_dev_healthy.list")
eval_list = _read_list("data_eval.list")  # no label table is loaded for eval, so nothing to check against

# Each file list must have exactly one entry per row of its label table.
# The messages name the split that failed (the dev checks previously said "train").
assert len(train_unhealthy_list) == df_train_unhealthy.shape[0], "train data not right shape"
assert len(train_healthy_list) == df_train_healthy.shape[0], "train data not right shape"
assert len(dev_unhealthy_list) == df_dev_unhealthy.shape[0], "dev data not right shape"
assert len(dev_healthy_list) == df_dev_healthy.shape[0], "dev data not right shape"


In [30]:
# Interpret the buffer returned by read_file for each listed file as np.short
# samples and fold it into rows of 8 values (one array per file).
eval_array = [np.frombuffer(read_file(path), dtype=np.short).reshape(-1, 8) for path in eval_list]
dev_unhealthy_array = [np.frombuffer(read_file(path), dtype=np.short).reshape(-1, 8) for path in dev_unhealthy_list]
dev_healthy_array = [np.frombuffer(read_file(path), dtype=np.short).reshape(-1, 8) for path in dev_healthy_list]

In [None]:
# Training files, parsed the same way as the dev/eval files: np.short samples
# reshaped to rows of 8 values, one array per listed file.
train_unhealthy_array = [np.frombuffer(read_file(rec), dtype=np.short).reshape(-1, 8) for rec in train_unhealthy_list]
train_healthy_array = [np.frombuffer(read_file(rec), dtype=np.short).reshape(-1, 8) for rec in train_healthy_list]
# eval_array is rebuilt here as well (the dev/eval loading cell also builds it).
eval_array = [np.frombuffer(read_file(rec), dtype=np.short).reshape(-1, 8) for rec in eval_list]

## Data processing

In [31]:
# Merge the per-file dev arrays into one 3-D array:
#   axis 0 -> recording, axis 1 -> time sample, axis 2 -> channel
# The arrays are stacked on a new last axis, which is then moved to the front.
healthytemp = np.moveaxis(np.stack(dev_healthy_array, axis=2), 2, 0)
unhealthytemp = np.moveaxis(np.stack(dev_unhealthy_array, axis=2), 2, 0)

# Healthy recordings come first, followed by the unhealthy ones.
X = np.concatenate((healthytemp, unhealthytemp), axis=0)
X.shape


(20000, 2200, 8)

In [38]:
# Stack the eval arrays on a new last axis, then reorder to
# (recording, time, channel). This rebinds X.
X = np.stack(eval_array, axis=2).transpose(2, 0, 1)
X.shape

(20000, 2200, 8)

In [19]:
# Merge the per-file training arrays into one 3-D array:
#   axis 0 -> recording, axis 1 -> time sample, axis 2 -> channel
healthytemp = np.transpose(np.stack(train_healthy_array, axis=2), (2, 0, 1))
unhealthytemp = np.transpose(np.stack(train_unhealthy_array, axis=2), (2, 0, 1))

# Healthy recordings come first, followed by the unhealthy ones. This rebinds X.
X = np.concatenate([healthytemp, unhealthytemp])
X.shape


(199487, 2200, 8)

In [32]:
# Label table for the dev split: healthy rows first, then unhealthy rows, which
# is the same order used when X is concatenated from the dev arrays.
y = pd.concat([df_dev_healthy, df_dev_unhealthy], axis=0)
print(y.shape)

# NOTE(review): the earlier note here read "zeros for class healthy, ones for
# class unhealthy" — confirm against the CSV contents.

(20000, 6)


In [12]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# repeatable across kernel restarts, so downstream scores can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Feature engineering

The average heartbeat lasts about 0.8 seconds.
Feature idea: take the area under the curve.




In [21]:
kernel_sizes = [60, 120, 180, 240, 300]  # adjust kernel sizes to change model performance


def feature_eng(X, kernel_sizes=kernel_sizes):
    """Build a 2-D feature matrix from a batch of recordings.

    The input is divided by 255.0, one block of features is produced per
    kernel size by ``make_conv_feat`` (project helper imported from ``utils``),
    and the maximum and minimum over axis 1 are appended as two more blocks.

    Parameters
    ----------
    X : numpy.ndarray
        Batch of recordings; presumably (recording, time, channel) as built by
        the stacking cells — TODO confirm.
    kernel_sizes : sequence of int
        Window sizes passed one at a time to ``make_conv_feat``. Defaults to
        the module-level ``kernel_sizes`` list as bound when this function was
        defined.

    Returns
    -------
    numpy.ndarray
        ``np.hstack`` of every ``make_conv_feat`` block (in kernel-size order),
        followed by the max block and then the min block.
    """
    # NOTE(review): dividing by 255 only rescales the samples; for 16-bit input
    # it does not bound them to [-1, 1] as an earlier comment suggested.
    scaled = X / 255.0
    X_max = np.max(scaled, axis=1)
    X_min = np.min(scaled, axis=1)
    # Convolution kernels are the different window sizes.
    conv_feats = [make_conv_feat(scaled, size) for size in kernel_sizes]
    return np.hstack(conv_feats + [X_max, X_min])


feature_eng(X_test).shape

(8000, 96)

In [39]:
# Features for whatever X currently holds; X is rebound by each of the
# stacking cells (dev, eval, train), so the content depends on which ran last.
X_train_processed = feature_eng(X)
#X_test_processed = feature_eng(X_test)

## Model training

In [15]:
# Column names of the label table; reused as the CSV headers for predictions.
diseases = list(y.columns)
print(diseases)


['1dAVb', 'RBBB', 'LBBB', 'SB', 'AF', 'ST']


In [16]:
# Multi-label classification: y has one column per label, and a single random
# forest is fit on all columns at once (scikit-learn accepts a 2-D y).
# random_state is fixed so that retraining yields the same forest.
RFmodel = RF(n_estimators=100, random_state=42)
RFmodel.fit(X_train_processed, y)
y_train_pred = RFmodel.predict(X_train_processed)

# Earlier plan noted here: 7 targets (6 diseases plus healthy vs. unhealthy),
# all trained separately. This cell fits them jointly instead.
# NOTE(review): X_train_processed must have been built from the same rows as y
# (see the feature-engineering cell) — confirm before trusting the fit.


1




# Baseline: one independent logistic-regression model per label column.
#
# Features are derived here from the X_train / X_test split so their rows line
# up with y_train / y_test. Previously the loop fit X_train_processed (built
# from the full X) against y_train, and read X_test_processed, whose only
# assignment in this notebook is commented out.
X_train_lr = feature_eng(X_train)
X_test_lr = feature_eng(X_test)

y_train_preds = []
y_test_preds = []

for x in y.columns:
    LRmodel = LR()
    LRmodel.fit(X_train_lr, y_train[x])
    y_train_pred = LRmodel.predict(X_train_lr)
    y_test_pred = LRmodel.predict(X_test_lr)
    y_train_preds.append(y_train_pred)
    y_test_preds.append(y_test_pred)

# One prediction vector per label column.
print(len(y_test_preds))

# Earlier plan noted here: 7 targets (6 diseases plus healthy vs. unhealthy),
# all trained separately.

## Metrics

In [40]:
# Tags interpolated into the hyps/hyp_{alg}_{data}.csv file names written by
# produce_hyp_files.
alg = "RNF"
data = "eval"

In [41]:
# Predict with the fitted forest on X_train_processed, i.e. on the features of
# whatever X was last passed to feature_eng (not necessarily training data).
y_pred = RFmodel.predict(X_train_processed)

In [36]:
def produce_hyp_files(y_pred, y_test, alg, data):
    """Write predictions and reference labels as CSV files under ``hyps/``.

    Parameters
    ----------
    y_pred : array-like, 2-D
        Predictions; column ``j`` is written under the ``j``-th name in the
        module-level ``diseases`` list.
    y_test : pandas.DataFrame
        Reference labels, written unchanged (without the index).
    alg, data : str
        Tags used in the file names ``hyps/hyp_{alg}_{data}.csv`` and
        ``hyps/ref_{alg}_{data}.csv``.
    """
    # to_csv does not create missing directories, so make sure hyps/ exists.
    os.makedirs("hyps", exist_ok=True)
    y_pred = np.asarray(y_pred)
    df = pd.DataFrame({dis: y_pred[:, j] for j, dis in enumerate(diseases)})
    df.to_csv(f'hyps/hyp_{alg}_{data}.csv', index=False)
    y_test.to_csv(f'hyps/ref_{alg}_{data}.csv', index=False)
    #python score.py preds.csv labels.csv

# Writes hyps/hyp_{alg}_{data}.csv from y_pred and hyps/ref_{alg}_{data}.csv from y.
# NOTE(review): y is built from the dev label tables; make sure `data` names the same split.
produce_hyp_files(y_pred,y,alg,data)


In [42]:
def produce_hyp_files(y_pred, alg, data):
    """Write predictions as ``hyps/hyp_{alg}_{data}.csv`` (no reference file).

    NOTE: this definition replaces the four-argument ``produce_hyp_files``
    defined in an earlier cell of this notebook.

    Parameters
    ----------
    y_pred : array-like, 2-D
        Predictions; column ``j`` is written under the ``j``-th name in the
        module-level ``diseases`` list.
    alg, data : str
        Tags used in the output file name.
    """
    # to_csv does not create missing directories, so make sure hyps/ exists.
    os.makedirs("hyps", exist_ok=True)
    y_pred = np.asarray(y_pred)
    df = pd.DataFrame({dis: y_pred[:, j] for j, dis in enumerate(diseases)})
    df.to_csv(f'hyps/hyp_{alg}_{data}.csv', index=False)
    #python score.py preds.csv labels.csv

# Writes hyps/hyp_{alg}_{data}.csv from y_pred; no reference file is written here.
produce_hyp_files(y_pred,alg,data)

# Gather the per-label test predictions into one table (one column per entry
# of `diseases`, taken from y_test_preds by position) and save it next to the
# held-out labels.
df = pd.DataFrame({dis: y_test_preds[j] for j, dis in enumerate(diseases)})
df.to_csv('preds.csv', index=False)
y_test.to_csv("labels.csv", index=False)
#python score.py preds.csv labels.csv

In [37]:
import subprocess

# Reference and hypothesis CSVs handed to score.py.
# NOTE(review): these are absolute paths into one user's home directory and
# name the "dev" files, while the `data` tag set earlier is "eval" — confirm
# they point at the files you intend to score.
ref = "/home/tud03114/finalproject/hyps/ref_RNF_dev.csv"
hyp = "/home/tud03114/finalproject/hyps/hyp_RNF_dev.csv"

# score.py is invoked with the reference file first, then the hypothesis file.
score_cmd = ["python", "score.py", ref, hyp]
subprocess.run(score_cmd)

Metric 1: simple accuracy
 err / acc = 0.3164 / 0.6836

Metric 2: micro accuracy / precision / recall / f1
 micro acc / prec / rec / f1 = 0.9279 / 0.9184 / 0.3547 / 0.5118

Metric 3: macro accuracy / precision / recall / f1
  [1dAVb] acc / prec / rec / f1 = 0.9120 / 0.7500 / 0.0017 / 0.0034
  [RBBB] acc / prec / rec / f1 = 0.9548 / 0.9349 / 0.8021 / 0.8634
  [LBBB] acc / prec / rec / f1 = 0.9661 / 0.8950 / 0.7042 / 0.7883
  [SB] acc / prec / rec / f1 = 0.9257 / 0.8961 / 0.2045 / 0.3330
  [AF] acc / prec / rec / f1 = 0.8970 / 0.7755 / 0.0182 / 0.0356
  [ST] acc / prec / rec / f1 = 0.9120 / 0.7500 / 0.0017 / 0.0034
 macro acc / prec / rec / f1 = 0.9279 / 0.8336 / 0.2887 / 0.3378



CompletedProcess(args=['python', 'score.py', '/home/tud03114/finalproject/hyps/ref_RNF_dev.csv', '/home/tud03114/finalproject/hyps/hyp_RNF_dev.csv'], returncode=0)