Using scikit-learn to fit a classifier
# Fit a k-NN classifier on two charge features, then predict churn for new samples.
import numpy as np

from sklearn.neighbors import KNeighborsClassifier

# Feature matrix (2 columns) and target vector.
# NOTE(review): churn_df is a pandas DataFrame defined elsewhere — confirm columns exist.
X = churn_df[["total_day_charge", "total_eve_charge"]].values
y = churn_df["churn"].values
print(X.shape, y.shape)

# 15 nearest neighbors vote on each prediction.
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X, y)

# Three unseen observations with the same two features as X.
X_new = np.array([[56.8, 17.5],
                  [24.4, 24.1],
                  [50.1, 10.9]])
print(X_new.shape)
# (3, 2)

# Fixed: was `knn.predict(x_new)` / `print(... prediction)` — undefined names.
predictions = knn.predict(X_new)
print('Predictions: {}'.format(predictions))
# Predictions: [1 0 0]
# Import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Pull the feature matrix and target vector out of the churn DataFrame
X = churn_df[["account_length", "customer_service_calls"]].values
y = churn_df["churn"].values

# Build a 6-neighbor KNN model and train it on the full data
model = KNeighborsClassifier(n_neighbors=6)
model.fit(X, y)

# Label the unseen observations held in X_new
predicted_labels = model.predict(X_new)

# Report the predicted classes
print("Predictions: {}".format(predicted_labels))
How do we measure accuracy?
We could compute accuracy on the data used to fit the classifier,
but that is not indicative of the model's ability to generalize to unseen data.
Solution: a train/test split.
# Evaluate generalization by scoring on a held-out test set.
from sklearn.model_selection import train_test_split

# 70/30 split; stratify=y keeps the churn class ratio identical in both splits,
# random_state=21 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21, stratify=y)

# Fixed typo: was `KNeightborsClassifier`.
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)

# score() returns mean accuracy on the unseen test data.
print(knn.score(X_test, y_test))
train_test_split returns four arrays: the training data, the test data, the training labels, and the test labels.