import numpy as np
= ["positive", "negative", "negative", "positive", "positive", "positive", "negative", "positive", "negative", "positive", "positive", "positive", "positive", "negative", "negative", "negative"]
y_true
= [0.7, 0.3, 0.5, 0.6, 0.55, 0.9, 0.4, 0.2, 0.4, 0.3, 0.7, 0.5, 0.8, 0.2, 0.3, 0.35]
pred_scores
= np.arange(start=0.2, stop=0.7, step=0.05) thresholds
When a model has high recall but low precision, it classifies most of the positive samples correctly but also produces many false positives (i.e. it classifies many negative samples as positive). When a model has high precision but low recall, it is accurate whenever it classifies a sample as positive, but it may find only some of the positive samples.
Note that as recall increases, precision tends to decrease. The reason is that to capture more of the positive samples (higher recall), the decision threshold has to be lowered, which also lets more negative samples be predicted as positive, so the fraction of predicted positives that are correct (precision) drops.
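To make the definitions concrete, here is a minimal sketch with illustrative counts (the numbers below are assumptions for the example, not taken from the data above): precision measures how trustworthy a positive prediction is, recall measures how many of the actual positives were found.

# illustrative counts only (assumed for this sketch)
TP = 6   # positive samples predicted as positive
FP = 4   # negative samples predicted as positive
FN = 3   # positive samples predicted as negative
precision = TP / (TP + FP)   # 6 / 10 = 0.6
recall = TP / (TP + FN)      # 6 / 9 ≈ 0.67
print(f'Precision {precision}, Recall {recall}')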
import sklearn.metrics

def precision_recall_curve(y_true, pred_scores, thresholds):
    precisions = []
    recalls = []

    for threshold in thresholds:
        y_pred = ["positive" if score >= threshold else "negative" for score in pred_scores]

        precision = sklearn.metrics.precision_score(y_true=y_true, y_pred=y_pred, pos_label="positive")
        recall = sklearn.metrics.recall_score(y_true=y_true, y_pred=y_pred, pos_label="positive")

        precisions.append(precision)
        recalls.append(recall)

    return precisions, recalls
# when the threshold (0.2) is low, all samples are predicted positive, which definitely includes the actual positives,
# so recall is high but precision is low because there are more predicted positives than actual positives
y_pred = ["positive" if score >= 0.2 else "negative" for score in pred_scores]
precision = sklearn.metrics.precision_score(y_true=y_true, y_pred=y_pred, pos_label="positive")
recall = sklearn.metrics.recall_score(y_true=y_true, y_pred=y_pred, pos_label="positive")
print(f'Precision {precision}, Recall {recall}')
Precision 0.5625, Recall 1.0
# when the threshold (0.9) is high, the predicted positives are almost all actual positives (true positives), which makes precision high:
# the model is more than 0.9 sure that the sample is positive, so it is very likely an actual positive, but many positives are missed
# because of the high threshold, so recall is low
y_pred = ["positive" if score >= 0.9 else "negative" for score in pred_scores]
precision = sklearn.metrics.precision_score(y_true=y_true, y_pred=y_pred, pos_label="positive")
recall = sklearn.metrics.recall_score(y_true=y_true, y_pred=y_pred, pos_label="positive")
print(f'Precision {precision}, Recall {recall}')
Precision 1.0, Recall 0.1111111111111111
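Both results can be sanity-checked by counting directly (a sketch using the y_true and pred_scores defined at the top): y_true contains 9 positives; at threshold 0.2 all 16 samples are predicted positive, so precision = 9/16 = 0.5625 and recall = 9/9 = 1.0, while at threshold 0.9 only the single sample scored 0.9 is predicted positive and it is a true positive, so precision = 1/1 = 1.0 and recall = 1/9 ≈ 0.111.

# sanity check of the two runs above, counting directly from y_true and pred_scores
n_pos = sum(1 for label in y_true if label == "positive")   # 9 actual positives
n_pred_low = sum(1 for s in pred_scores if s >= 0.2)        # 16 predicted positives at threshold 0.2
n_pred_high = sum(1 for s in pred_scores if s >= 0.9)       # 1 predicted positive at threshold 0.9
print(n_pos / n_pred_low, n_pos / n_pos)                    # 0.5625, 1.0
print(1 / n_pred_high, 1 / n_pos)                           # 1.0, 0.111...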
precisions, recalls = precision_recall_curve(y_true, pred_scores, thresholds)
print(f'Precision list {precisions}')
print(f'Recall list {recalls}')
Precision list [0.5625, 0.5714285714285714, 0.5714285714285714, 0.6363636363636364, 0.7, 0.875, 0.875, 1.0, 1.0, 1.0]
Recall list [1.0, 0.8888888888888888, 0.8888888888888888, 0.7777777777777778, 0.7777777777777778, 0.7777777777777778, 0.7777777777777778, 0.6666666666666666, 0.5555555555555556, 0.4444444444444444]
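The lists have 10 entries because np.arange(start=0.2, stop=0.7, step=0.05) excludes the stop value, so the thresholds run from 0.20 to 0.65. As a quick sketch, the last entries can be checked at threshold 0.65:

# at threshold 0.65 only the scores 0.7, 0.9, 0.7, 0.8 pass, and all four of those samples are actually positive,
# so precision = 4/4 = 1.0 and recall = 4/9 ≈ 0.444, matching the last entries of the lists above
y_pred = ["positive" if score >= 0.65 else "negative" for score in pred_scores]
print(sklearn.metrics.precision_score(y_true=y_true, y_pred=y_pred, pos_label="positive"),
      sklearn.metrics.recall_score(y_true=y_true, y_pred=y_pred, pos_label="positive"))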
import matplotlib.pyplot as plt
="red")
plt.plot(recalls, precisions, color"Recall")
plt.xlabel("Precision")
plt.ylabel("Precision-Recall Curve obtained by varying score threshold \n score threshold is increasing from right to left")
plt.title( plt.show()
Similarly, if we vary the IoU threshold instead of the score threshold, we get another precision-recall curve.
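For object detection, a prediction only counts as a true positive if its IoU (intersection over union) with a ground-truth box reaches the chosen IoU threshold, so raising that threshold also trades recall against precision. A minimal sketch of that decision follows; the [x1, y1, x2, y2] box format and the box_iou helper are assumptions for illustration, not part of the code below.

def box_iou(box1, box2):
    # boxes given as [x1, y1, x2, y2]; returns intersection over union
    ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    return inter / (area1 + area2 - inter)

pred_box, gt_box = [0, 0, 10, 10], [5, 0, 15, 10]
iou = box_iou(pred_box, gt_box)    # 50 / 150 ≈ 0.33
print(iou, iou >= 0.5)             # at IoU threshold 0.5 this prediction would be a false positive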
def compute_ap(recall, precision):
    # from ultralytics
    # Append sentinel values to beginning and end
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([1.0], precision, [0.0]))
    print(f'mpre {mpre}')
    print(f'np.flip {np.flip(mpre)}')
    print(f'np.accumulate.maximum {np.maximum.accumulate(np.flip(mpre))}')

    # Compute the precision envelope
    mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))

    # Integrate area under curve
    method = "interp"  # methods: 'continuous', 'interp'
    if method == "interp":
        x = np.linspace(0, 1, 101)  # 101-point interp (COCO)
        ap = np.trapz(np.interp(x, mrec, mpre), x)  # integrate
    else:  # 'continuous'
        i = np.where(mrec[1:] != mrec[:-1])[0]  # points where x-axis (recall) changes
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])  # area under curve

    return ap, mpre, mrec
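As a quick check, compute_ap can also be applied to the score-threshold precision/recall lists computed earlier (a sketch; the np.interp call inside compute_ap needs recall in increasing order, so the lists are reversed first).

# sketch: average precision of the score-threshold curve from above
ap, mpre, mrec = compute_ap(recalls[::-1], precisions[::-1])
print(f'AP {ap}')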
def smooth(y, f=0.05):
    """Box filter of fraction f."""
    nf = round(len(y) * f * 2) // 2 + 1  # number of filter elements (must be odd)
    p = np.ones(nf // 2)  # ones padding
    yp = np.concatenate((p * y[0], y, p * y[-1]), 0)  # y padded
    return np.convolve(yp, np.ones(nf) / nf, mode="valid")  # y-smoothed
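A minimal usage sketch of smooth with arbitrary values: for 5 points and f=0.4, nf = 3, so each output value is the mean of a 3-element window, with the edges padded using the first and last values.

y_noisy = np.array([0.1, 0.9, 0.2, 0.8, 0.3])   # arbitrary example values
print(smooth(y_noisy, f=0.4))                   # approx [0.37 0.40 0.63 0.43 0.47]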
def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, eps=1e-16):

    # Sort by objectness
    i = np.argsort(-conf)
    tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]  # decreasing order of confidences
    tp = tp.reshape((tp.shape[0], 1))

    # Find unique classes
    unique_classes, nt = np.unique(target_cls, return_counts=True)
    nc = unique_classes.shape[0]  # number of classes, number of detections

    # Create Precision-Recall curve and compute AP for each class
    x, prec_values = np.linspace(0, 1, 1000), []

    # Average precision, precision and recall curves
    ap, p_curve, r_curve = np.zeros((nc, tp.shape[1])), np.zeros((nc, 1000)), np.zeros((nc, 1000))
    for ci, c in enumerate(unique_classes):
        i = pred_cls == c
        n_l = nt[ci]  # number of labels or ground truth
        n_p = i.sum()  # number of predictions
        if n_p == 0 or n_l == 0:
            continue

        # Accumulate FPs and TPs
        fpc = (1 - tp[i]).cumsum(0)
        tpc = tp[i].cumsum(0)

        # Recall
        recall = tpc / (n_l + eps)  # recall curve
        print(f'recall {recall[:, 0]}')
        # print(f'-conf[i] {-conf[i]}')
        r_curve[ci] = np.interp(-x, -conf[i], recall[:, 0], left=0)  # negative x, xp because xp decreases
        # print(f'r_curve[ci] {r_curve[ci]}')

        # Precision
        precision = tpc / (tpc + fpc)  # precision curve
        print(f'precision {precision[:, 0]}')
        # print(f'-conf[i] {-conf[i]}')
        p_curve[ci] = np.interp(-x, -conf[i], precision[:, 0], left=1)  # p at pr_score
        # print(f'p_curve[ci] {p_curve[ci]}')

        # AP from recall-precision curve
        for j in range(tp.shape[1]):
            ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j])
            print(f'AP {ap[ci, j]}')
            if plot and j == 0:
                prec_values.append(np.interp(x, mrec, mpre))

    prec_values = np.array(prec_values)  # (nc, 1000)

    # Compute F1 (harmonic mean of precision and recall)
    f1_curve = 2 * p_curve * r_curve / (p_curve + r_curve + eps)

    i = smooth(f1_curve.mean(0), 0.1).argmax()  # max F1 index
    p, r, f1 = p_curve[:, i], r_curve[:, i], f1_curve[:, i]  # max-F1 precision, recall, F1 values
    tp = (r * nt).round()  # true positives
    fp = (tp / (p + eps) - tp).round()  # false positives
    # return tp, fp, p, r, ap, f1, unique_classes.astype(int), p_curve, r_curve, f1_curve, x, prec_values
    return 1
In object detection, true positives are decided using IoU between the predicted box and a ground-truth box. Walking through the example below by hand, the running (precision, recall) values after each detection are:
class 1: p = 1, r = 0.33 → p = 0.5, r = 0.33 → p = 0.66, r = 0.66
class 2: p = 1, r = 0.5 → p = 0.5, r = 0.5
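These running values can be reproduced directly with cumulative sums (a sketch using the same tp entries that are passed to ap_per_class below).

# class 1: tp entries [1, 0, 1], 3 ground-truth boxes
tpc = np.cumsum([1, 0, 1]); fpc = np.cumsum([0, 1, 0])
print(tpc / 3, tpc / (tpc + fpc))   # recall [0.33 0.33 0.67], precision [1.  0.5  0.67]
# class 2: tp entries [1, 0], 2 ground-truth boxes
tpc = np.cumsum([1, 0]); fpc = np.cumsum([0, 1])
print(tpc / 2, tpc / (tpc + fpc))   # recall [0.5 0.5], precision [1.  0.5]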
tp = np.array([1, 0, 1, 1, 0])
conf = np.array([1, 1, 1, 1, 1])
pred_cls = np.array(['1', '1', '1', '2', '2'])
target_cls = np.array(['1', '1', '1', '2', '2'])
ap_per_class(tp, conf, pred_cls, target_cls, plot=False, eps=1e-16)
recall [0.33333333 0.33333333 0.66666667]
precision [1. 0.5 0.66666667]
mpre [1. 1. 0.5 0.66666667 0. ]
np.flip [0. 0.66666667 0.5 1. 1. ]
np.accumulate.maximum [0. 0.66666667 0.66666667 1. 1. ]
AP 0.6672
recall [0.5 0.5]
precision [1. 0.5]
mpre [1. 1. 0.5 0. ]
np.flip [0. 0.5 1. 1. ]
np.accumulate.maximum [0. 0.5 1. 1. ]
AP 0.6224999999999999
1
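The mean of these per-class APs gives the mAP at this IoU threshold (a quick sketch using the two values printed above).

# mAP at this IoU threshold = mean of the per-class APs
print(np.mean([0.6672, 0.6225]))    # ~0.645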