-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathread_dataset.py
More file actions
84 lines (69 loc) · 2.93 KB
/
read_dataset.py
File metadata and controls
84 lines (69 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
'''
First things first: we build our dataset.
1. Download the Drebin dataset.
2. Read the CSV file to learn which dataset units are malware and which
are not.
3. Get all files in the feature_vectors folder.
4. Label them {1|0} according to their classification {malware|not malware}.
This gives us the features and their numbers of occurrences, which help us
select the features to use for training our model.
'''
import pandas as pd
import os
import features_extraction
import numpy as np
def _load_samples(directory, file_names):
    """Return one extracted feature sample per file in *file_names*.

    Each file is read from *directory*, split into lines, and passed to
    ``features_extraction.extract_features``; results keep the input order.
    """
    samples = []
    for file_name in file_names:
        with open(os.path.join(directory, file_name), 'r') as file:
            file_content = file.read().splitlines()
        samples.append(features_extraction.extract_features(file_content))
    return samples


def read_data():
    """Build the labelled, balanced dataset from the local Drebin files.

    Reads ``drebin/sha256_family.csv`` (relative to the current working
    directory) and treats every value in its first column as the file name
    of a malware sample.  Every file in ``drebin/feature_vectors`` whose
    name appears there is labelled 1; the rest are labelled 0.

    To keep the classes balanced, only as many non-malware files as there
    are malware files are loaded.  The directory listing is sorted so the
    same non-malware files are selected on every run.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays.  ``x`` holds one entry per
        loaded file, as returned by ``features_extraction.extract_features``
        (malware samples first, then non-malware); ``y`` holds the matching
        labels, 1 for malware and 0 otherwise.

    Raises:
        OSError: if the CSV file, the feature directory, or one of the
            feature files cannot be read.
    """
    print('\nReading data from CSV file...')
    # Build the path with os.path.join: the former literal
    # 'drebin\sha256_family.csv' relied on an invalid '\s' escape and on
    # the Windows path separator.
    malwares = pd.read_csv(os.path.join('drebin', 'sha256_family.csv'),
                           dtype=str)
    print('Found (' + str(len(malwares.index)) + ') malwares in csv file.')

    print('Reading dataset files...')
    data_path = os.path.join(os.getcwd(), 'drebin', 'feature_vectors')
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    dataset_files = sorted(os.listdir(data_path))
    print('Found (' + str(len(dataset_files)) + ') files to classify.')

    # Separate malwares from non-malwares [Building ground truth arrays].
    # A set built once gives O(1) lookups instead of scanning the CSV
    # column for every file.
    malware_names = set(malwares.iloc[:, 0])
    malware_files = []
    not_malware_files = []
    for file_name in dataset_files:
        if file_name in malware_names:
            malware_files.append(file_name)
        else:
            not_malware_files.append(file_name)

    malware_files_length = len(malware_files)
    not_malware_files_length = len(not_malware_files)
    print('Found (' + str(malware_files_length) + ') malware files.')
    print('Found (' + str(not_malware_files_length) + ') safe files.')

    # Balance the classes: keep exactly as many safe files as malware files.
    # Use `not_malware_files` directly to work with the unbalanced dataset.
    selected_safe_files = not_malware_files[:malware_files_length]

    # x = {set of features}, y = {0|1}: 1 for malware, 0 otherwise.
    malware_samples = _load_samples(data_path, malware_files)
    safe_samples = _load_samples(data_path, selected_safe_files)

    x = np.array(malware_samples + safe_samples)
    y = np.array([1] * len(malware_samples) + [0] * len(safe_samples))

    print("\nFeatures & Labels arrays' shapes, respectively: " +
          str(x.shape), str(y.shape))
    return x, y