Yes, sure. I also feel something fishy is going on here. Thanks for your help, it is much appreciated.
Start of code
import cPickle as pickle
import shutil
from nupic.serializable import Serializable
# The anomaly calculation modules
from nupic.algorithms.anomaly import Anomaly
from nupic.algorithms import anomaly_likelihood as AL
# These are the python implementation of the spatial pooler and the temporal memory
# from nupic.research.temporal_memory import TemporalMemory as TM
# from nupic.research.spatial_pooler import SpatialPooler as SP
# These are the same but the faster, C++ implementation
from nupic.bindings.algorithms import SpatialPooler as SP
from nupic.bindings.algorithms import TemporalMemory as TM
# The encoders
# from nupic.encoders import ScalarEncoder
from nupic.encoders import SDRCategoryEncoder
from nupic.encoders.date import DateEncoder
from nupic.encoders import random_distributed_scalar
# Can't live without Numpy
import numpy as np
# The standard python datetime module
import datetime
# The standard python csv module
import csv
import glob
import os
# Helper function to find the indices of the active bits
def find_idxs(li):
    return [i for i, x in enumerate(li) if int(x) == 1]

# Helper function to get a column SDR out of the temporal memory's cell indices
def get_sdr(tm_cellsPerCol, cells):
    return set([x / tm_cellsPerCol for x in cells])
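# For example: find_idxs([0, 1, 1, 0]) returns [1, 2], and with 32 cells per column
# get_sdr(32, set([0, 33, 64])) returns set([0, 1, 2]) (Python 2 integer division
# maps cell indices back to their column indices).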
# This dictionary holds the model parameters, including the spatial pooler, temporal memory and encoder parameters.
settings = {
    'sp': SP(inputDimensions=(884,),
             columnDimensions=(2048,),
             synPermConnected=0.1,
             numActiveColumnsPerInhArea=40.0,  ### framework_v11 - 2% of 2048
             boostStrength=3.0,
             synPermActiveInc=0.04,
             synPermInactiveDec=0.005,
             globalInhibition=True,
             potentialPct=0.85,
             seed=1956,
             wrapAround=False),
    'tm': TM(columnDimensions=(2048,),
             cellsPerColumn=32,
             initialPermanence=0.21,
             ## added
             connectedPermanence=0.5,
             maxNewSynapseCount=20,
             permanenceIncrement=0.1,
             permanenceDecrement=0.1,
             minThreshold=2,
             activationThreshold=16,
             maxSegmentsPerCell=128,
             maxSynapsesPerSegment=32,
             ##### added
             predictedSegmentDecrement=0.0,
             seed=1960),
    # 'w': 3,
    'n': 2048,  ### framework_v8_v1 - this value is changed to match columnDimensions
    'encoder_n': 6,
    'timeOfDay': (1, 6)
}
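# A sanity check on the sizes (my reading of the encoder parameters, worth double-checking):
# timeOfDay=(1, 6) should give a 4-bit date SDR (w=1 over a 24-hour range with radius 6),
# and the four RDSE encodings are 220 bits each, so the concatenated input is
# 4 + 4*220 = 884 bits, matching inputDimensions=(884,) above.
# Also note that this dict holds already-constructed SP and TM objects, so every
# HTMModel_score instance below shares the same pooler and memory: learned state
# carries over between the files processed in main().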
# This class is our HTM/CLA model, consisting of several encoders (one per column), a spatial pooler and a temporal memory
class HTMModel_score(object):
    # Initializing (note: these are class attributes)
    prevPredictedColumns = np.array([])
    anomaly = Anomaly(slidingWindowSize=None, mode='pure', binaryAnomalyThreshold=None)

    # The constructor takes the dataset length (datasetLen) so it can size the moving windows needed for the anomaly likelihood calculation
    def __init__(self, datasetLen, filename):
        # initializing these variables based on the parameter dictionary
        self.n = settings['n']
        self.activeColumns = []
        self.sp = settings['sp']
        self.tm = settings['tm']
        self.date_sdr = []
        self.filename = filename
        # Build the encoders once here instead of inside compute(): a fresh
        # RandomDistributedScalarEncoder sets its internal offset from the first
        # value it encodes, so re-creating it on every row would re-centre the
        # buckets each time and produce inconsistent encodings.
        self.de = DateEncoder(timeOfDay=settings['timeOfDay'])
        ### Ben: I changed the n, added w, and changed the resolution
        # encoder_rds = random_distributed_scalar.RandomDistributedScalarEncoder(n=200, resolution=0.01)
        ## 220 bits with 21 active, with buckets of size 0.1
        self.encoder_rds = random_distributed_scalar.RandomDistributedScalarEncoder(n=220, w=21, resolution=0.1)
        estimationSamples = int(datasetLen * 0.1)
        learningPeriod = int(datasetLen * 0.2)
        historicWindowSize = int(datasetLen * 0.2)
        ## must read >>> https://github.com/numenta/nupic/blob/50c5fd0dc94f2ffb205544ed11fe82ad5bb0de18/src/nupic/algorithms/anomaly_likelihood.py#L154-L155
        self.al = AL.AnomalyLikelihood(
            learningPeriod=learningPeriod,  # if you get an error about this argument, it was renamed between NuPIC versions (the older name was claLearningPeriod)
            estimationSamples=estimationSamples,
            historicWindowSize=historicWindowSize,
            reestimationPeriod=estimationSamples
        )
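        # Roughly speaking (from the linked source): learningPeriod is the number of
        # iterations during which the model is still learning and the anomaly scores
        # are not yet reliable, and estimationSamples is the number of scores used to
        # build the initial distribution estimate.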
    # This method takes a row of the dataset and the row number (i)
    def compute(self, i, row, learn=True, infer=True):
        # Getting the fields we need out of the dataset row
        print row[0]  # progress/debug output
        hr = row[4]
        bps = row[9]
        bpd = row[14]
        spo = row[19]
        #####################
        sensor_hr_score = row[1]
        sensor_hr_log = row[3]
        sensor_bps_score = row[6]
        sensor_bps_log = row[8]
        sensor_bpd_score = row[11]
        sensor_bpd_log = row[13]
        sensor_spo_score = row[16]
        sensor_spo_log = row[18]
        p_service = row[20]
        c_service = row[21]
        date = row[0]
        # Creating the date SDR
        now = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
        # now = (now - now.replace(hour=0, minute=0, second=0)).seconds / 3600
        sdr_date = self.de.encode(now)
        self.date_sdr = sdr_date
        # Encoding the four sensor scores with the shared RDSE built in __init__
        rds_hr_score = self.encoder_rds.encode(float(sensor_hr_score))
        rds_bps_score = self.encoder_rds.encode(float(sensor_bps_score))
        rds_bpd_score = self.encoder_rds.encode(float(sensor_bpd_score))
        rds_spo_score = self.encoder_rds.encode(float(sensor_spo_score))
        # Then we concatenate the date SDR with the four score SDRs into one big input SDR
        sdr = np.concatenate((sdr_date, rds_hr_score, rds_bps_score, rds_bpd_score, rds_spo_score))
        # Creating an empty active array to store the output of the spatial pooler
        activeArray = np.zeros(self.n, dtype="uint32")
        # Feeding the SDR to the spatial pooler; the learn flag tells the spatial pooler to learn from this input. The output is stored in activeArray.
        self.sp.compute(sdr, learn, activeArray)
        # The activeArray is a binary vector, so to get the actual indices of the active bits we use the helper function
        self.activeColumns = set(find_idxs(activeArray))
        # Then we feed that to the temporal memory (sorted, to be safe with the C++ TM binding).
        # The temporal memory does not return anything, but we can get the active and predictive cells from `self.tm` afterwards.
        self.tm.compute(sorted(self.activeColumns), learn=learn)
        # This calculates the raw anomaly score
        anomalyScore = self.anomaly.compute(list(self.activeColumns), list(self.prevPredictedColumns))
        # Getting the predictive cells, converting them to a column SDR, and storing it in `self.prevPredictedColumns` for the next timestep
        predictedColumns = get_sdr(self.tm.getCellsPerColumn(), self.tm.getPredictiveCells())
        self.prevPredictedColumns = set(predictedColumns)
        # Calculating the likelihood probability of the anomaly score (as far as I can tell, the first
        # argument is meant to be the raw metric value; the distribution is estimated from the anomaly
        # scores themselves, so the SDR passed here is only stored with the record)
        likeScore = self.al.anomalyProbability(sdr, anomalyScore, date)
        # Calculating the log likelihood probability of the anomaly score
        logScore = self.al.computeLogLikelihood(likeScore)
        # We have three anomaly metrics to experiment with: the raw anomaly score, the likelihood probability, and the log of that.
        # In my experience the log likelihood is the best one; however, this might depend on the dataset.
        # Finally we return these scores along with the actual labels and sensor values.
        return (date, round(float(anomalyScore), 2), round(float(likeScore), 2), round(float(logScore), 2), p_service, c_service, hr, bps, bpd, spo)
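# (For reference: with NuPIC's normalization, a log likelihood of 0.5 corresponds to an
# anomaly likelihood of roughly 0.99999, which is a commonly used alerting threshold.)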
def main():
    # with open(r"C:\Framework_V11\Framework\step_2.pkl") as f:
    #     model = pickle.load(f)
    datasetfile = glob.glob(r'C:\Framework_V11\Framework\step_two\scores_combined_step_three\*.csv')
    for file in datasetfile:
        head, tail = os.path.split(file)
        with open(file) as f:
            datasetLen = len(f.readlines()) - 1
        reader = csv.reader(open(file, 'r'))
        next(reader)  # skipping the header row
        # Here we create an instance of our HTM model
        model = HTMModel_score(datasetLen, filename=tail)
        filename_to_write = tail.replace('.csv', '')
        print filename_to_write
        out_dir = r'C:\Framework_V11\Framework\step_two\scores_combined_step_five_anomalyscore_only'
        writer = csv.writer(open(os.path.join(out_dir, filename_to_write + '.csv'), 'wb'))
        writer.writerow(('timestamp', 'anomalyScore', 'anomalyLikelihood', 'logLikelihood', 'p_service', 'c_service', 'hr', 'bps', 'bpd', 'spo2'))
        # reading the dataset row by row and feeding the rows to the model
        for i, row in enumerate(reader, start=1):
            # feeding the model the row number and the row content
            result = model.compute(i, row)
            # writing the results
            writer.writerow(result)
    # Pickle the last model so it can be reloaded later (note the binary mode)
    with open(r"C:\Framework_V11\Framework\step_3.pkl", "wb") as f:
        pickle.dump(model, f)

if __name__ == '__main__':
    main()
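
In case it is useful, here is a minimal sketch of how I would reload the pickled model and score one new row with it. This assumes step_3.pkl was written by the script above and that the row has the same 22-column layout (the row contents below are made-up placeholder values). Also note that pickling models holding the C++ SP/TM bindings does not always survive across NuPIC versions, so treat this as illustrative only.

import cPickle as pickle

# Reload the model pickled at the end of main()
with open(r"C:\Framework_V11\Framework\step_3.pkl", "rb") as f:
    model = pickle.load(f)

# `new_row` is a hypothetical row with the same layout as the input CSVs
new_row = ["2018-01-01 00:00:00"] + ["0.0"] * 21
# Score it; returns (timestamp, anomalyScore, likelihood, logLikelihood, labels, values...)
print model.compute(0, new_row)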