Skip to content

Commit 28e43e7

Browse files
committed
Add Pearson correlation coefficient algorithm for data analysis
1 parent 02680c9 commit 28e43e7

File tree

1 file changed

+77
-0
lines changed

1 file changed

+77
-0
lines changed
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""
2+
Pearson Correlation Coefficient: Measures the linear relationship between two
3+
variables. The result is a value between -1 and 1, where:
4+
1 = perfect positive correlation
5+
0 = no linear correlation
6+
-1 = perfect negative correlation
7+
8+
It is widely used in data analysis, statistics, and machine learning to
9+
understand relationships between features in a dataset.
10+
11+
Reference: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
12+
"""
13+
14+
15+
def pearson_correlation(x: list[float], y: list[float]) -> float:
    """
    Calculate the Pearson Correlation Coefficient between two lists.

    The coefficient is the sum of the products of the paired deviations from
    the means, divided by the product of the square roots of the summed
    squared deviations of each list. The result is rounded to 10 decimal
    places so that floating-point noise does not leak into values such as
    a perfect correlation.

    Parameters
    ----------
    x: list[float], first list of numbers
    y: list[float], second list of numbers

    Returns
    -------
    float: Pearson correlation coefficient between -1 and 1

    Raises
    ------
    ValueError: if either list is empty, if the lists differ in length, or
        if either list has no variation (all of its elements are equal)

    >>> pearson_correlation([1, 2, 3, 4, 5], [1, 2, 3, 4, 5])
    1.0
    >>> pearson_correlation([1, 2, 3, 4, 5], [5, 4, 3, 2, 1])
    -1.0
    >>> pearson_correlation([1, 2, 3], [4, 5, 6])
    1.0
    >>> round(pearson_correlation([1, 2, 3, 4], [1, 2, 1, 2]), 4)
    0.4472
    >>> pearson_correlation([], [1, 2, 3])
    Traceback (most recent call last):
        ...
    ValueError: lists must not be empty
    >>> pearson_correlation([1, 2, 3], [1, 2])
    Traceback (most recent call last):
        ...
    ValueError: lists must have the same length
    >>> pearson_correlation([1, 1, 1], [2, 2, 2])
    Traceback (most recent call last):
        ...
    ValueError: standard deviation of x or y is zero
    >>> pearson_correlation([0.1, 0.1, 0.1], [1, 2, 3])
    Traceback (most recent call last):
        ...
    ValueError: standard deviation of x or y is zero
    """
    if not x or not y:
        raise ValueError("lists must not be empty")
    if len(x) != len(y):
        raise ValueError("lists must have the same length")

    # A constant list has zero spread by definition. Detect it from the values
    # themselves: the float mean of e.g. [0.1, 0.1, 0.1] is not exactly 0.1, so
    # the computed deviations come out tiny but non-zero and would slip past
    # the check on the computed spread further down.
    if min(x) == max(x) or min(y) == max(y):
        raise ValueError("standard deviation of x or y is zero")

    n = len(x)
    mean_x = sum(x) / n
    mean_y = sum(y) / n

    # Compute each deviation once and reuse it for all three sums.
    dev_x = [xi - mean_x for xi in x]
    dev_y = [yi - mean_y for yi in y]

    numerator = sum(dx * dy for dx, dy in zip(dev_x, dev_y))
    std_x = sum(dx**2 for dx in dev_x) ** 0.5
    std_y = sum(dy**2 for dy in dev_y) ** 0.5

    # Still required: squared deviations of extremely close values can
    # underflow to zero, which would otherwise cause a division by zero.
    if std_x == 0 or std_y == 0:
        raise ValueError("standard deviation of x or y is zero")

    return round(numerator / (std_x * std_y), 10)
66+
67+
68+
if __name__ == "__main__":
    import doctest

    # Run the examples embedded in the docstrings before the demo below.
    doctest.testmod()

    # Small demonstration on a sample pair of lists.
    x = [1, 2, 3, 4, 5]
    y = [2, 4, 5, 4, 5]
    print(f"x: {x}", f"y: {y}", sep="\n")
    print(f"Pearson correlation: {pearson_correlation(x, y)}")

0 commit comments

Comments
 (0)