random_forest.py
import numpy as np
from sklearn import datasets, decomposition, ensemble

import ml_helpers
from decision_tree import ClassificationTree

class RandomForest:
    # Random Forest classifier. Uses a collection of classification trees, each
    # trained on a random subset of the data and a random subset of the features.
    # Parameters:
    # -----------
    # n_estimators: int
    #     The number of classification trees that are used.
    # max_features: int
    #     The maximum number of features that each classification tree is allowed
    #     to use.
    # min_samples_split: int
    #     The minimum number of samples needed to make a split when building a tree.
    # min_gain: float
    #     The minimum information gain required to split the tree further.
    # max_depth: int
    #     The maximum depth of a tree.
    # debug: boolean
    #     True if training progress should be printed.
def __init__(self, n_estimators=100, max_features=None, min_samples_split=2,
min_gain=1e-7, max_depth=float("inf"), debug=False):
self.n_estimators = n_estimators # Number of trees
        self.max_features = max_features  # Maximum number of features per tree
self.feature_indices = [] # The indices of the features used for each tree
self.min_samples_split = min_samples_split
self.min_gain = min_gain # Minimum information gain req. to continue
self.max_depth = max_depth # Maximum depth for tree
self.debug = debug
        # Initialize the decision trees
        self.trees = []
        for _ in range(n_estimators):
            self.trees.append(ClassificationTree(min_samples_split=self.min_samples_split,
                                                 min_impurity=min_gain,
                                                 max_depth=self.max_depth))
def fit(self, X, y):
n_features = np.shape(X)[1]
        # If max_features has not been defined, default to sqrt(n_features)
if not self.max_features:
self.max_features = int(np.sqrt(n_features))
if self.debug:
print ("Training (%s estimators):" % (self.n_estimators))
        # Choose one random bootstrap subset of the data for each tree
        # (a sketch of the assumed helper behavior follows the class below)
subsets = ml_helpers.get_random_subsets(X, y, self.n_estimators)
for i in range(self.n_estimators):
X_subset, y_subset = subsets[i]
            # Feature bagging (select a random subset of the features, without
            # replacement so each tree sees max_features distinct features)
            idx = np.random.choice(range(n_features), size=self.max_features, replace=False)
# Save the indices of the features for prediction
self.feature_indices.append(idx)
# Choose the features corresponding to the indices
X_subset = X_subset[:, idx]
# Fit the tree to the data
self.trees[i].fit(X_subset, y_subset)
            if self.debug:
                progress = 100 * ((i + 1) / self.n_estimators)
                print("Progress: %.2f%%" % progress)
def predict(self, X):
y_preds = []
# Let each tree make a prediction on the data
for i, tree in enumerate(self.trees):
# Select the features that the tree has trained on
idx = self.feature_indices[i]
# Make a prediction based on those features
prediction = tree.predict(X[:, idx])
y_preds.append(prediction)
# Take the transpose of the matrix to transform it so
# that rows are samples and columns are predictions by the
# estimators
y_preds = np.array(y_preds).T
y_pred = []
# For each sample
for sample_predictions in y_preds:
# Do a majority vote over the predictions (columns)
max_count = 0
most_common = None
            # For each unique predicted label -> count occurrences
            # and save the most predicted label
            for label in np.unique(sample_predictions):
                count = np.sum(sample_predictions == label)
if count > max_count:
max_count = count
most_common = label
# The most common prediction gets added as final prediction
# of the sample
y_pred.append(most_common)
return y_pred
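
# fit() above relies on ml_helpers.get_random_subsets. A minimal sketch of the
# behavior assumed here (a hypothetical stand-in, not the actual helper): it
# returns one bootstrap sample (X_subset, y_subset) per estimator, drawn with
# replacement from the training rows.
def _bootstrap_subsets(X, y, n_subsets):
    n_samples = np.shape(X)[0]
    subsets = []
    for _ in range(n_subsets):
        # Sample row indices with replacement (standard bagging)
        idx = np.random.choice(n_samples, size=n_samples, replace=True)
        subsets.append((X[idx], y[idx]))
    return subsets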
# **************************************************************
# Apply the Random Forest for Classification Manually
# **************************************************************
# Get the training data
# Import the Iris flower dataset
iris = datasets.load_iris()
train_data = np.array(iris.data)
train_labels = np.array(iris.target)
num_features = train_data.shape[1]
# Randomly shuffle the data
train_data, train_labels = ml_helpers.shuffle_data(train_data, train_labels)
# Apply PCA to the data; note that with n_components=4 on the 4 Iris features
# this only decorrelates the data rather than reducing its dimensionality
pca = decomposition.PCA(n_components=4)
pca.fit(train_data)
train_data = pca.transform(train_data)
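
# Optional sanity check: since all 4 components are kept, the explained
# variance ratios should sum to (roughly) 1.0.
print("PCA explained variance ratios: ", pca.explained_variance_ratio_)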
X_train, X_test, y_train, y_test = ml_helpers.train_test_split(train_data, train_labels, test_size=0.4)
clf = RandomForest()
clf.fit(X_train, y_train)
predicted_labels = clf.predict(X_test)
# Compute the testing accuracy
accuracy = 0
for index in range(len(predicted_labels)):
    current_label = y_test[index]
    predicted_label = predicted_labels[index]
    if current_label == predicted_label:
        accuracy += 1
accuracy /= len(predicted_labels)
# Print the testing accuracy
print("Manual Random Forest Classification Accuracy = ", accuracy)
# **************************************************************
# Apply the Random Forest for Classification using Sklearn
# **************************************************************
clf = ensemble.RandomForestClassifier(n_estimators=10, criterion="gini")
clf.fit(X_train, y_train)
predicted_labels = clf.predict(X_test)
# Compute the testing accuracy
accuracy = 0
for index in range(len(predicted_labels)):
    current_label = y_test[index]
    predicted_label = predicted_labels[index]
    if current_label == predicted_label:
        accuracy += 1
accuracy /= len(predicted_labels)
# Print the testing accuracy
print("Sklearn Random Forest Classification Accuracy = ", accuracy)