# Source: malware-analysis/read_dataset.py (elsheikh21/malware-analysis, master branch)
'''
First things first: we build our dataset.
1. Download the Drebin dataset.
2. Read the CSV file to learn which dataset samples are malware
    and which are not.
3. List all files in the feature_vectors folder.
4. Label them {1|0} according to their classification {malware|not malware},
    so that we have the features and their numbers of occurrences, which will
    help us select the features to be used for training our model.
'''

import pandas as pd
import os
import features_extraction
import numpy as np


def read_data(dataset_dir='drebin', balanced=True):
    """Build the feature matrix and label vector from the Drebin dataset.

    The function reads ``sha256_family.csv`` (whose first column lists the
    names of the malware samples), lists every file in the
    ``feature_vectors`` folder, splits the files into malware / safe, and
    runs ``features_extraction.extract_features`` on the lines of each
    selected file.

    Args:
        dataset_dir: Folder containing ``sha256_family.csv`` and the
            ``feature_vectors`` sub-folder. A relative path is resolved
            against the current working directory.
        balanced: When true (the default), at most as many safe files as
            there are malware files are loaded, so both classes have the
            same size whenever enough safe files exist. When false, every
            safe file is loaded.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays: ``x`` holds one extracted sample
        per file (malware samples first, then safe ones) and ``y`` holds the
        matching labels, 1 for malware and 0 otherwise.
    """
    print('\nReading data from CSV file...')

    # Build the path with os.path.join: a hard-coded backslash separator only
    # works on Windows and '\s' is an invalid string escape sequence.
    csv_path = os.path.join(dataset_dir, 'sha256_family.csv')
    malwares = pd.read_csv(csv_path, dtype=str)

    print('Found (' + str(len(malwares.index)) + ') malwares in csv file.')

    print('Reading dataset files...')

    # Read all the files in the feature vectors folder
    data_path = os.path.join(os.getcwd(), dataset_dir, 'feature_vectors')
    dataset_files = os.listdir(data_path)
    print('Found (' + str(len(dataset_files)) + ') files to classify.')

    # Separate malwares from non-malwares [Building ground truth arrays].
    # The names are put in a set once so that each lookup is O(1) instead of
    # a scan over the whole CSV column for every file.
    malware_names = set(malwares.iloc[:, 0])
    malware_files = []
    not_malware_files = []
    for file_name in dataset_files:
        if file_name in malware_names:
            malware_files.append(file_name)
        else:
            not_malware_files.append(file_name)

    print('Found (' + str(len(malware_files)) + ') malware files.')
    print('Found (' + str(len(not_malware_files)) + ') safe files.')

    # For a balanced dataset keep exactly as many safe files as malware
    # files (or all of them when fewer are available).
    if balanced:
        safe_files = not_malware_files[:len(malware_files)]
    else:
        safe_files = not_malware_files

    # Extract features from dataset files, and label them
    # 1 for malware, 0 otherwise
    # x = {set of features}, y = {0|1}
    labelled_files = [(name, 1) for name in malware_files]
    labelled_files += [(name, 0) for name in safe_files]

    x = []
    y = []
    for file_name, label in labelled_files:
        with open(os.path.join(data_path, file_name), 'r') as file:
            file_content = file.read().splitlines()
        x.append(features_extraction.extract_features(file_content))
        y.append(label)

    x = np.array(x)
    y = np.array(y)
    print("\nFeatures & Labels arrays' shapes, respectively: " +
          str(x.shape), str(y.shape))
    return x, y