Python - Trouble making a new prediction for new categorical variables using a numpy array

Problem Description

I have built an artificial neural network with 4 features. I now want to test the model on a live sample of a malicious file path/exe, using:

new_prediction = classifier.predict(sc.transform(np.array([[]])))

I know that if I reuse file paths already seen in training, such as "C:\Program Files (x86)\Wireless AutoSwitch", I can plug in the 0s and 1s that the label and one-hot encoding already produced for each categorical feature. But how do you handle a new categorical value that does not appear in the training set array? (One standard option is sketched just after the training data below.) Say the new sample I want to test is:

   ParentPath            ParentExe     ChildPath           ChildExe
0  C:\Windows\Malicious  badscipt.exe  C:\Windows\System   cmd.exe  

The training dataset looks like this:

    ParentPath                                  ParentExe
0   C:\Program Files (x86)\Wireless AutoSwitch  wrlssw.exe
1   C:\Program Files (x86)\Wireless AutoSwitch  WrlsAutoSW.exs
2   C:\Program Files (x86)\Wireless AutoSwitch  WrlsAutoSW.exs
3   C:\Windows\System32                         svchost.exe
4   C:\Program Files (x86)\Wireless AutoSwitch  WrlsAutoSW.exs

ChildPath                                   ChildExe
C:\Windows\System32                         conhost.exe
C:\Program Files (x86)\Wireless AutoSwitch  wrlssw.exe
C:\Program Files (x86)\Wireless AutoSwitch  wrlssw.exe
C:\Program Files\Common Files               OfficeC2RClient.exe
C:\Program Files (x86)\Wireless AutoSwitch  wrlssw.exe
C:\Program Files (x86)\Wireless AutoSwitch  wrlssw.exe
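
For reference: since scikit-learn 0.20, OneHotEncoder accepts string columns directly, and its handle_unknown='ignore' option encodes a category never seen at fit time as an all-zero row instead of raising an error, which is one standard answer to this question. A minimal sketch (the paths are taken from the tables above; this is not the question's original pipeline):

#Sketch: an unseen category becomes an all-zero row instead of an error
import numpy as np
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')
train_paths = np.array([['C:\\Program Files (x86)\\Wireless AutoSwitch'],
                        ['C:\\Windows\\System32']])
enc.fit(train_paths)
new_path = np.array([['C:\\Windows\\Malicious']])   #never seen during training
print(enc.transform(new_path).toarray())            #-> [[0. 0.]]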

Code:

#Libraries
import pandas as pd
import numpy as np
import hashlib
import matplotlib.pyplot as plt
import timeit

#################### GOOD ###################
#Read in csv to df
DF = pd.read_csv('/home/gpubetterwork/Documents/Good-Merged-TAGS_8-23-2018_060000-95959_TAG_Parent_Child.csv')
#Select 2 columns (copy so the edits below do not hit pandas' SettingWithCopyWarning)
DF1 = DF[['filePath', 'destinationProcessName']].copy()
#Rename columns
DF1.columns = ['ParentPathExe', 'ChildPathExe']
#Replace all NaN with Unknown
DF1['ParentPathExe'] = DF1['ParentPathExe'].replace(np.nan, 'UNKNOWN')
DF1['ChildPathExe'] = DF1['ChildPathExe'].replace(np.nan, 'UNKNOWN')
#Split ParentPathExe into path and exe columns
DParent = DF1['ParentPathExe'].str.rsplit("\\", n=1, expand=True)
#Rename columns
DParent.columns = ['ParentPath', 'ParentExe']
#Split ChildPathExe into path and exe columns
DChild = DF1['ChildPathExe'].str.rsplit("\\", n=1, expand=True)
#Rename columns
DChild.columns = ['ChildPath', 'ChildExe']
#Merge the two dataframes together
DF1 = pd.concat([DParent, DChild], axis = 1)
#Fill new column DependentVariable with 0's
DF1['Suspicous'] = 0

####################### BAD ######################
BF = pd.read_csv('/home/gpubetterwork/Documents/4688_events_PC-Tags_last_7_days_BAD2.csv')
#Select 2 columns (again copy to avoid SettingWithCopyWarning)
BF1 = BF[['filePath', 'destinationProcessName']].copy()
#Rename columns
BF1.columns = ['ParentPathExe', 'ChildPathExe']
#Replace all NaN with Unknown
BF1['ParentPathExe'] = BF1['ParentPathExe'].replace(np.nan, 'UNKNOWN')
BF1['ChildPathExe'] = BF1['ChildPathExe'].replace(np.nan, 'UNKNOWN')
#Split ParentPathExe into path and exe columns
BParent = BF1['ParentPathExe'].str.rsplit("\\", n=1, expand=True)
#Rename columns
BParent.columns = ['ParentPath', 'ParentExe']
#Split ChildPathExe into path and exe columns
BChild = BF1['ChildPathExe'].str.rsplit("\\", n=1, expand=True)
#Rename columns
BChild.columns = ['ChildPath', 'ChildExe']
#Merge the two dataframes together
BF1 = pd.concat([BParent, BChild], axis = 1)
#Fill new column DependentVariable with 1's
BF1['Suspicous'] = 1

############# MERGE GOOD AND BAD DATAFRAMES ###########
#Merge the two dataframes
DBF1 = pd.concat([DF1, BF1])
#Reset index
DBF1 = DBF1.reset_index(drop=True)  
#Randomize rows
DBF2 = DBF1.sample(frac=1).reset_index(drop=True)

############### ARTIFICIAL NEURAL NETWORK ##############
#TIME THE NEURAL NETWORK
start_time = timeit.default_timer()

#STEP 1
#Import the dataset
X = DBF2.iloc[:, 0:4].values
#X = DBF2[['ParentProcess', 'ChildProcess']]
y = DBF2.iloc[:, 4].values

#Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#Label Encode Parent Path
labelencoder_X_1 = LabelEncoder()
X[:, 0] = labelencoder_X_1.fit_transform(X[:, 0])
#Label Encode Parent Exe
labelencoder_X_2 = LabelEncoder()
X[:, 1] = labelencoder_X_2.fit_transform(X[:, 1])
#Label Encode Child Path
labelencoder_X_3 = LabelEncoder()
X[:, 2] = labelencoder_X_3.fit_transform(X[:, 2])
#Label Encode Child Exe
labelencoder_X_4 = LabelEncoder()
X[:, 3] = labelencoder_X_4.fit_transform(X[:, 3])

#Create dummy variables (note: categorical_features was removed in scikit-learn
#0.22; newer versions one-hot encode the string columns directly or use ColumnTransformer)
onehotencoder = OneHotEncoder(categorical_features = [0,1,2,3])
X = onehotencoder.fit_transform(X)

#Drop one dummy column per encoded feature (dummy variable trap); the indices
#below are specific to this training set's category counts
index_to_drop = [0, 1627, 2292, 5922]
to_keep = list(set(range(X.shape[1])) - set(index_to_drop))
X = X[:, to_keep]

#Splitting the dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

#Feature Scaling
from sklearn.preprocessing import StandardScaler
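#with_mean=False is required here: after one-hot encoding X is sparse, and
#mean-centering would densify it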
sc = StandardScaler(with_mean=False)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#STEP 2
#Make the ANN
import keras
from keras.models import Sequential
from keras.layers import Dense

#Initialising the ANN
classifier = Sequential()
#Adding the input layer and the first hidden layer
classifier.add(Dense(units=3678, kernel_initializer='uniform', activation='relu', input_dim=7356))
#Adding a second hidden layer
classifier.add(Dense(units=3678, kernel_initializer='uniform', activation='relu'))
#Adding the output layer
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
#Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#Fitting the ANN to the training set
classifier.fit(X_train, y_train, batch_size=1000, epochs=10)                           

#STEP 3
#Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)



##### NEW PREDICTION #####
#Must be in an array
new_prediction = classifier.predict(sc.transform(np.array([[]])))
new_prediction = (new_prediction > 0.5)
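
For completeness, here is how a sample whose values all appeared in training would go through the fitted pipeline above (a minimal sketch; the four strings are copied from row 3 of the training data, and the variable names new_raw/new_onehot are ours). A genuinely unseen string would make LabelEncoder.transform raise a ValueError, which is exactly the problem this question is about:

#Encode one already-seen sample with the fitted encoders, then predict
new_raw = np.array([['C:\\Windows\\System32', 'svchost.exe',
                     'C:\\Program Files\\Common Files', 'OfficeC2RClient.exe']], dtype=object)
new_raw[:, 0] = labelencoder_X_1.transform(new_raw[:, 0])
new_raw[:, 1] = labelencoder_X_2.transform(new_raw[:, 1])
new_raw[:, 2] = labelencoder_X_3.transform(new_raw[:, 2])
new_raw[:, 3] = labelencoder_X_4.transform(new_raw[:, 3])
new_onehot = onehotencoder.transform(new_raw)[:, to_keep]   #drop the same dummy columns
new_prediction = classifier.predict(sc.transform(new_onehot))
new_prediction = (new_prediction > 0.5)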

Tags: python, numpy, neural-network, keras

Solution


Use Google's word2vec idea: treat each full path as a short "sentence" of tokens. Note that the code below does not load a pretrained word2vec model; it hashes the tokens with Keras's one_hot and learns an Embedding layer together with the classifier, so paths and exes that never occurred in training still map into the same vector space.

#Merge the two dataframes
DBF1 = pd.concat([DF1, BF1])
#Reset index
DBF1 = DBF1.reset_index(drop=True)
#Randomize rows
DBF2 = DBF1.sample(frac=1).reset_index(drop=True)
#Rebuild the full path+exe strings (the question's preprocessing split them
#into Path/Exe columns, but this approach works on the whole strings)
DBF2['ParentProcess'] = DBF2['ParentPath'].astype(str) + '\\' + DBF2['ParentExe'].astype(str)
DBF2['ChildProcess'] = DBF2['ChildPath'].astype(str) + '\\' + DBF2['ChildExe'].astype(str)
#Replace \ with space
DBF2['ParentProcess'] = DBF2['ParentProcess'].str.replace("\\", " ")
DBF2['ChildProcess'] = DBF2['ChildProcess'].str.replace("\\", " ")

#Create a new column combining Parent and Child
DBF2['New'] = DBF2.ParentProcess.astype(str).str.cat(DBF2.ChildProcess.astype(str), sep=' ')
#Create a new list from DBF2['New']
NewList = DBF2['New'].tolist()
#Create a np array from Suspicous column
SuspiciousLabels = DBF2['Suspicous'].values

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

#Define documents
docs = NewList
#Define class labels
labels = SuspiciousLabels
#Integer encode the documents
vocab_size = 2000
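#one_hot hashes every word into [1, vocab_size), so a token never seen in
#training still lands in some bucket (hash collisions are possible); this is
#what lets the model score brand-new paths and exes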
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)
#Pad documents to a max length of 40 words
max_length = 40
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
#Define the model
classifier = Sequential()
#Embedding: map each hashed token id to a learned 8-dimensional vector
classifier.add(Embedding(vocab_size, 8, input_length=max_length))
#Flatten the 40x8 embedding matrix into a single 320-long vector
classifier.add(Flatten())
#Output layer
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
#Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#Fitting the ANN to the training set
classifier.fit(padded_docs, labels, batch_size=1000, epochs=10)                               

#Create a data frame with a malicious test sample
testmalicious = {'ParentProcess': ['C:\\Windows\\System32\\services.exe'], 'ChildProcess': ['C:\\Windows\\System32\\svch0st.exe']}
testmaliciousdf = pd.DataFrame(data=testmalicious)
testmaliciousdf = testmaliciousdf[['ParentProcess', 'ChildProcess']]
#Replace \ with space
testmaliciousdf['ParentProcess'] = testmaliciousdf['ParentProcess'].str.replace("\\", " ")
testmaliciousdf['ChildProcess'] = testmaliciousdf['ChildProcess'].str.replace("\\", " ")
#Create a new column combining Parent and Child
testmaliciousdf['New'] = testmaliciousdf.ParentProcess.astype(str).str.cat(testmaliciousdf.ChildProcess.astype(str), sep=' ')
#Create a new list from DBF2['New']
testmaliciousNewList = testmaliciousdf['New'].tolist()
#Encode
testmalicious_encoded_docs = [one_hot(d, vocab_size) for d in testmaliciousNewList]
testmalicious_padded_docs = pad_sequences(testmalicious_encoded_docs, maxlen=max_length, padding='post')
#Predict if it is Suspicious(TRUE) or Not Suspicious(FALSE)
bad_new_prediction_percent = classifier.predict(testmalicious_padded_docs)
bad_new_prediction = (bad_new_prediction_percent > 0.1)
print(bad_new_prediction)
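
The same steps can be reused to sanity-check a benign pair; the sample below is hypothetical, and whether it actually comes back False depends on the trained weights:

#Hypothetical benign sample: services.exe spawning svchost.exe
testgood = {'ParentProcess': ['C:\\Windows\\System32\\services.exe'], 'ChildProcess': ['C:\\Windows\\System32\\svchost.exe']}
testgooddf = pd.DataFrame(data=testgood)[['ParentProcess', 'ChildProcess']]
#Same preprocessing as above: backslashes to spaces, concatenate, hash, pad
testgooddf['ParentProcess'] = testgooddf['ParentProcess'].str.replace("\\", " ")
testgooddf['ChildProcess'] = testgooddf['ChildProcess'].str.replace("\\", " ")
testgooddf['New'] = testgooddf.ParentProcess.str.cat(testgooddf.ChildProcess, sep=' ')
testgood_encoded_docs = [one_hot(d, vocab_size) for d in testgooddf['New'].tolist()]
testgood_padded_docs = pad_sequences(testgood_encoded_docs, maxlen=max_length, padding='post')
good_new_prediction = (classifier.predict(testgood_padded_docs) > 0.1)
print(good_new_prediction)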
