From 6bf81df1d593df07cc29838e61265414b6bf8958 Mon Sep 17 00:00:00 2001 From: W0lfgunbl00d Date: Tue, 5 Nov 2024 14:49:32 +0100 Subject: [PATCH 01/16] v1test implemented the rls, no tests yet --- river/linear_model/__init__.py | 2 ++ river/linear_model/rls.py | 46 ++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 river/linear_model/rls.py diff --git a/river/linear_model/__init__.py b/river/linear_model/__init__.py index 756720490a..e74d0d1439 100644 --- a/river/linear_model/__init__.py +++ b/river/linear_model/__init__.py @@ -10,6 +10,7 @@ from .pa import PAClassifier, PARegressor from .perceptron import Perceptron from .softmax import SoftmaxRegression +from .rls import RLS __all__ = [ "base", @@ -21,4 +22,5 @@ "PARegressor", "Perceptron", "SoftmaxRegression", + "RLS", ] diff --git a/river/linear_model/rls.py b/river/linear_model/rls.py new file mode 100644 index 0000000000..71e189b0b3 --- /dev/null +++ b/river/linear_model/rls.py @@ -0,0 +1,46 @@ +import numpy as np + +class RLS(object): + + def __init__(self, p: int, l: float, delta: float): + self.p = p + self.l = l + self.delta = delta + + self.currentStep = 0 + + self.x = np.zeros((p + 1, 1)) # Column vector + self.P = np.identity(p + 1) * self.delta + + self.estimates = [] + self.estimates.append(np.zeros((p + 1, 1))) # Weight vector initialized to zeros + + self.Pks = [] + self.Pks.append(self.P) + + def estimate(self, xn: float, dn: float): + # Update input vector + self.x = np.roll(self.x, -1) + self.x[-1, 0] = xn + + # Get previous weight vector + wn_prev = self.estimates[-1] + + # Compute gain vector + denominator = self.l + self.x.T @ self.Pks[-1] @ self.x + gn = (self.Pks[-1] @ self.x) / denominator + + # Compute a priori error + alpha = dn - (self.x.T @ wn_prev) + + # Update inverse correlation matrix + Pn = (self.Pks[-1] - gn @ self.x.T @ self.Pks[-1]) / self.l + self.Pks.append(Pn) + + # Update weight vector + wn = wn_prev + gn * alpha + self.estimates.append(wn) + + self.currentStep += 1 + + return wn From 6aa486973fda441ef55df36389d34adeb71d02fc Mon Sep 17 00:00:00 2001 From: W0lfgunbl00d Date: Tue, 5 Nov 2024 14:54:51 +0100 Subject: [PATCH 02/16] Update rls.py comments --- river/linear_model/rls.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/river/linear_model/rls.py b/river/linear_model/rls.py index 71e189b0b3..9aa69410e1 100644 --- a/river/linear_model/rls.py +++ b/river/linear_model/rls.py @@ -1,11 +1,12 @@ import numpy as np + class RLS(object): def __init__(self, p: int, l: float, delta: float): - self.p = p - self.l = l - self.delta = delta + self.p = p # Filter order + self.l = l # Forgetting factor + self.delta = delta # Value to initialise P(0) self.currentStep = 0 From 814fd8a9ceadb5be3933c9f0f510161d4818f01a Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Sat, 9 Nov 2024 16:19:07 +0100 Subject: [PATCH 03/16] Added an v0 adpredictor --- river/base/Adpredictor.py | 73 ++++++++++++ river/base/Adpredictor_test.ipynb | 189 ++++++++++++++++++++++++++++++ 2 files changed, 262 insertions(+) create mode 100644 river/base/Adpredictor.py create mode 100644 river/base/Adpredictor_test.ipynb diff --git a/river/base/Adpredictor.py b/river/base/Adpredictor.py new file mode 100644 index 0000000000..f58d326d14 --- /dev/null +++ b/river/base/Adpredictor.py @@ -0,0 +1,73 @@ +import numpy as np +from river import base +from river import optim +from collections import defaultdict +import logging +from collections import namedtuple +from river import compose, preprocessing, metrics + +logger = logging.getLogger(__name__) + +class AdPredictor(base.Classifier): + config = namedtuple('config', ['beta', 'prior_probability', 'epsilon', 'num_features']) + def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10): + self.beta = beta + self.prior_probability = prior_probability + self.epsilon = epsilon + self.num_features = num_features + self.weights = defaultdict(lambda: {'mean': 0.0, 'variance': 1.0}) + self.bias_weight = self.prior_bias_weight() + + def prior_bias_weight(self): + return np.log(self.prior_probability / (1 - self.prior_probability)) / self.beta + + def _active_mean_variance(self, features): + total_mean = sum(self.weights[f]['mean'] for f in features) + self.bias_weight + total_variance = sum(self.weights[f]['variance'] for f in features) + self.beta ** 2 + return total_mean, total_variance + + def predict_one(self, x): + features = x.keys() + total_mean, total_variance = self._active_mean_variance(features) + return 1 / (1 + np.exp(-total_mean / np.sqrt(total_variance))) + + def learn_one(self, x, y): + features = x.keys() + y = 1 if y else -1 # Map label to ±1 for binary classification + total_mean, total_variance = self._active_mean_variance(features) + v, w = self.gaussian_corrections(y * total_mean / np.sqrt(total_variance)) + + for feature in features: + mean = self.weights[feature]['mean'] + variance = self.weights[feature]['variance'] + + mean_delta = y * variance / np.sqrt(total_variance) * v + variance_multiplier = 1.0 - variance / total_variance * w + + # Update weight + self.weights[feature]['mean'] = mean + mean_delta + self.weights[feature]['variance'] = variance * variance_multiplier + + def gaussian_corrections(self, score): + """Compute Gaussian corrections for Bayesian update.""" + cdf = 1 / (1 + np.exp(-score)) + pdf = np.exp(-0.5 * score ** 2) / np.sqrt(2 * np.pi) + v = pdf / cdf + w = v * (v + score) + return v, w + + def _apply_dynamics(self, weight): + prior_variance = 1.0 + adjusted_variance = weight['variance'] * prior_variance / \ + ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight['variance']) + adjusted_mean = adjusted_variance * ( + (1.0 - self.epsilon) * weight['mean'] / weight['variance'] + + self.epsilon * 0 / prior_variance) + return {'mean': adjusted_mean, 'variance': adjusted_variance} + + def __str__(self): + return "AdPredictor" + + + + diff --git a/river/base/Adpredictor_test.ipynb b/river/base/Adpredictor_test.ipynb new file mode 100644 index 0000000000..b0897c1d3c --- /dev/null +++ b/river/base/Adpredictor_test.ipynb @@ -0,0 +1,189 @@ +{ + "cells": [ + { + "cell_type": "code", + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-11-09T15:04:34.707796Z", + "start_time": "2024-11-09T15:04:34.663651Z" + } + }, + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "execution_count": 9 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:04:37.139365Z", + "start_time": "2024-11-09T15:04:37.097733Z" + } + }, + "cell_type": "code", + "source": [ + "import os\n", + "print(os.getcwd())" + ], + "id": "65e3111b76277fc5", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "C:\\River\\riverIDLIB\\river\\base\n" + ] + } + ], + "execution_count": 10 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:04:38.751355Z", + "start_time": "2024-11-09T15:04:38.708257Z" + } + }, + "cell_type": "code", + "source": [ + "from river import datasets\n", + "from river import metrics\n", + "from river import preprocessing\n", + "from river import compose\n" + ], + "id": "3ffeadeef731f48e", + "outputs": [], + "execution_count": 11 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:04:45.986384Z", + "start_time": "2024-11-09T15:04:45.943819Z" + } + }, + "cell_type": "code", + "source": [ + "import importlib.util\n", + "\n", + "spec = importlib.util.spec_from_file_location(\"AdPredictor\", \"./AdPredictor.py\")\n", + "AdPredictor_module = importlib.util.module_from_spec(spec)\n", + "spec.loader.exec_module(AdPredictor_module)\n", + "\n", + "AdPredictor = AdPredictor_module.AdPredictor" + ], + "id": "e323aa048e864b33", + "outputs": [], + "execution_count": 12 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:04:49.810249Z", + "start_time": "2024-11-09T15:04:49.768766Z" + } + }, + "cell_type": "code", + "source": "phishing_data = datasets.Phishing()", + "id": "c862e33d656cb230", + "outputs": [], + "execution_count": 13 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:04:54.402788Z", + "start_time": "2024-11-09T15:04:54.358522Z" + } + }, + "cell_type": "code", + "source": [ + "model = AdPredictor(beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=9)\n", + "metric = metrics.Accuracy()" + ], + "id": "293c681cca67e1f4", + "outputs": [], + "execution_count": 14 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:04:59.609037Z", + "start_time": "2024-11-09T15:04:59.569533Z" + } + }, + "cell_type": "code", + "source": [ + "model_pipeline = compose.Pipeline(\n", + " ('scale', preprocessing.StandardScaler()),\n", + " ('predictor',model)\n", + ")" + ], + "id": "c8fb1bb8ed902d80", + "outputs": [], + "execution_count": 15 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:03:47.107046Z", + "start_time": "2024-11-09T15:03:47.054657Z" + } + }, + "cell_type": "code", + "source": [ + "for x, y in phishing_data:\n", + " y_pred = model_pipeline.predict_one(x)\n", + " metric = metric.update(y, y_pred)\n", + " model_pipeline = model_pipeline.learn_one(x, y)\n", + " print(f'Prediction: {y_pred}, Metric: {metric}')\n", + " break " + ], + "id": "1dea7c542ab4ad84", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction: 0.5, Metric: None\n" + ] + } + ], + "execution_count": 8 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b30974791713c556c7c1afb194ac28fdb9bac4ed Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Sat, 9 Nov 2024 16:22:11 +0100 Subject: [PATCH 04/16] Added an v0 adpredictor --- river/base/Adpredictor.py | 49 +++++++------ river/base/Adpredictor_test.ipynb | 113 +++++++++++++++--------------- 2 files changed, 82 insertions(+), 80 deletions(-) diff --git a/river/base/Adpredictor.py b/river/base/Adpredictor.py index f58d326d14..4702760cf8 100644 --- a/river/base/Adpredictor.py +++ b/river/base/Adpredictor.py @@ -1,29 +1,32 @@ +from __future__ import annotations + +import logging +from collections import defaultdict, namedtuple + import numpy as np + from river import base -from river import optim -from collections import defaultdict -import logging -from collections import namedtuple -from river import compose, preprocessing, metrics logger = logging.getLogger(__name__) + class AdPredictor(base.Classifier): - config = namedtuple('config', ['beta', 'prior_probability', 'epsilon', 'num_features']) + config = namedtuple("config", ["beta", "prior_probability", "epsilon", "num_features"]) + def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10): self.beta = beta self.prior_probability = prior_probability self.epsilon = epsilon self.num_features = num_features - self.weights = defaultdict(lambda: {'mean': 0.0, 'variance': 1.0}) + self.weights = defaultdict(lambda: {"mean": 0.0, "variance": 1.0}) self.bias_weight = self.prior_bias_weight() def prior_bias_weight(self): return np.log(self.prior_probability / (1 - self.prior_probability)) / self.beta def _active_mean_variance(self, features): - total_mean = sum(self.weights[f]['mean'] for f in features) + self.bias_weight - total_variance = sum(self.weights[f]['variance'] for f in features) + self.beta ** 2 + total_mean = sum(self.weights[f]["mean"] for f in features) + self.bias_weight + total_variance = sum(self.weights[f]["variance"] for f in features) + self.beta**2 return total_mean, total_variance def predict_one(self, x): @@ -38,36 +41,36 @@ def learn_one(self, x, y): v, w = self.gaussian_corrections(y * total_mean / np.sqrt(total_variance)) for feature in features: - mean = self.weights[feature]['mean'] - variance = self.weights[feature]['variance'] + mean = self.weights[feature]["mean"] + variance = self.weights[feature]["variance"] mean_delta = y * variance / np.sqrt(total_variance) * v variance_multiplier = 1.0 - variance / total_variance * w # Update weight - self.weights[feature]['mean'] = mean + mean_delta - self.weights[feature]['variance'] = variance * variance_multiplier + self.weights[feature]["mean"] = mean + mean_delta + self.weights[feature]["variance"] = variance * variance_multiplier def gaussian_corrections(self, score): """Compute Gaussian corrections for Bayesian update.""" cdf = 1 / (1 + np.exp(-score)) - pdf = np.exp(-0.5 * score ** 2) / np.sqrt(2 * np.pi) + pdf = np.exp(-0.5 * score**2) / np.sqrt(2 * np.pi) v = pdf / cdf w = v * (v + score) return v, w def _apply_dynamics(self, weight): prior_variance = 1.0 - adjusted_variance = weight['variance'] * prior_variance / \ - ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight['variance']) + adjusted_variance = ( + weight["variance"] + * prior_variance + / ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight["variance"]) + ) adjusted_mean = adjusted_variance * ( - (1.0 - self.epsilon) * weight['mean'] / weight['variance'] + - self.epsilon * 0 / prior_variance) - return {'mean': adjusted_mean, 'variance': adjusted_variance} + (1.0 - self.epsilon) * weight["mean"] / weight["variance"] + + self.epsilon * 0 / prior_variance + ) + return {"mean": adjusted_mean, "variance": adjusted_variance} def __str__(self): return "AdPredictor" - - - - diff --git a/river/base/Adpredictor_test.ipynb b/river/base/Adpredictor_test.ipynb index b0897c1d3c..5a6b4b4ab9 100644 --- a/river/base/Adpredictor_test.ipynb +++ b/river/base/Adpredictor_test.ipynb @@ -2,18 +2,15 @@ "cells": [ { "cell_type": "code", + "execution_count": 9, "id": "initial_id", "metadata": { - "collapsed": true, "ExecuteTime": { "end_time": "2024-11-09T15:04:34.707796Z", "start_time": "2024-11-09T15:04:34.663651Z" - } + }, + "collapsed": true }, - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n" - ], "outputs": [ { "name": "stdout", @@ -24,21 +21,23 @@ ] } ], - "execution_count": 9 + "source": [ + "from __future__ import annotations\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] }, { + "cell_type": "code", + "execution_count": 10, + "id": "65e3111b76277fc5", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:04:37.139365Z", "start_time": "2024-11-09T15:04:37.097733Z" } }, - "cell_type": "code", - "source": [ - "import os\n", - "print(os.getcwd())" - ], - "id": "65e3111b76277fc5", "outputs": [ { "name": "stdout", @@ -48,34 +47,38 @@ ] } ], - "execution_count": 10 + "source": [ + "import os\n", + "\n", + "print(os.getcwd())" + ] }, { + "cell_type": "code", + "execution_count": 11, + "id": "3ffeadeef731f48e", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:04:38.751355Z", "start_time": "2024-11-09T15:04:38.708257Z" } }, - "cell_type": "code", - "source": [ - "from river import datasets\n", - "from river import metrics\n", - "from river import preprocessing\n", - "from river import compose\n" - ], - "id": "3ffeadeef731f48e", "outputs": [], - "execution_count": 11 + "source": [ + "from river import compose, datasets, metrics, preprocessing" + ] }, { + "cell_type": "code", + "execution_count": 12, + "id": "e323aa048e864b33", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:04:45.986384Z", "start_time": "2024-11-09T15:04:45.943819Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "import importlib.util\n", "\n", @@ -84,75 +87,64 @@ "spec.loader.exec_module(AdPredictor_module)\n", "\n", "AdPredictor = AdPredictor_module.AdPredictor" - ], - "id": "e323aa048e864b33", - "outputs": [], - "execution_count": 12 + ] }, { + "cell_type": "code", + "execution_count": 13, + "id": "c862e33d656cb230", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:04:49.810249Z", "start_time": "2024-11-09T15:04:49.768766Z" } }, - "cell_type": "code", - "source": "phishing_data = datasets.Phishing()", - "id": "c862e33d656cb230", "outputs": [], - "execution_count": 13 + "source": [ + "phishing_data = datasets.Phishing()" + ] }, { + "cell_type": "code", + "execution_count": 14, + "id": "293c681cca67e1f4", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:04:54.402788Z", "start_time": "2024-11-09T15:04:54.358522Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "model = AdPredictor(beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=9)\n", "metric = metrics.Accuracy()" - ], - "id": "293c681cca67e1f4", - "outputs": [], - "execution_count": 14 + ] }, { + "cell_type": "code", + "execution_count": 15, + "id": "c8fb1bb8ed902d80", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:04:59.609037Z", "start_time": "2024-11-09T15:04:59.569533Z" } }, - "cell_type": "code", - "source": [ - "model_pipeline = compose.Pipeline(\n", - " ('scale', preprocessing.StandardScaler()),\n", - " ('predictor',model)\n", - ")" - ], - "id": "c8fb1bb8ed902d80", "outputs": [], - "execution_count": 15 + "source": [ + "model_pipeline = compose.Pipeline((\"scale\", preprocessing.StandardScaler()), (\"predictor\", model))" + ] }, { + "cell_type": "code", + "execution_count": 8, + "id": "1dea7c542ab4ad84", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:03:47.107046Z", "start_time": "2024-11-09T15:03:47.054657Z" } }, - "cell_type": "code", - "source": [ - "for x, y in phishing_data:\n", - " y_pred = model_pipeline.predict_one(x)\n", - " metric = metric.update(y, y_pred)\n", - " model_pipeline = model_pipeline.learn_one(x, y)\n", - " print(f'Prediction: {y_pred}, Metric: {metric}')\n", - " break " - ], - "id": "1dea7c542ab4ad84", "outputs": [ { "name": "stdout", @@ -162,7 +154,14 @@ ] } ], - "execution_count": 8 + "source": [ + "for x, y in phishing_data:\n", + " y_pred = model_pipeline.predict_one(x)\n", + " metric = metric.update(y, y_pred)\n", + " model_pipeline = model_pipeline.learn_one(x, y)\n", + " print(f\"Prediction: {y_pred}, Metric: {metric}\")\n", + " break" + ] } ], "metadata": { From 6a229d8cf8433f7b1f30abd2dea30fc56f28fc23 Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Sun, 17 Nov 2024 00:22:40 +0100 Subject: [PATCH 05/16] adpredictor algorithm --- river/linear_model/adpredictor.py | 156 ++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 river/linear_model/adpredictor.py diff --git a/river/linear_model/adpredictor.py b/river/linear_model/adpredictor.py new file mode 100644 index 0000000000..6af80ef810 --- /dev/null +++ b/river/linear_model/adpredictor.py @@ -0,0 +1,156 @@ +from __future__ import annotations + +from collections import defaultdict, namedtuple + +import numpy as np + +from river.base import Classifier + + +class AdPredictor(Classifier): + """AdPredictor, developed by Microsoft, is a machine learning algorithm designed to predict the probability of user + clicks on online advertisements. This algorithm plays a crucial role in computational advertising, where predicting + click-through rates (CTR) is essential for optimizing ad placements and maximizing revenue. + Parameters + ---------- + beta (float, default=0.1): + A smoothing parameter that regulates the weight updates. Smaller values allow for finer updates, + while larger values can accelerate convergence but may risk instability. + prior_probability (float, default=0.5): + The initial estimate rate. This value sets the bias weight, influencing the model's predictions + before observing any data. + + epsilon (float, default=0.1): + A variance dynamics parameter that controls how the model balances prior knowledge and learned information. + Larger values prioritize prior knowledge, while smaller values favor data-driven updates. + + num_features (int, default=10): + The maximum number of features the model can handle. This parameter affects scalability and efficiency, + especially for high-dimensional data. + + Attributes + ---------- + weights (defaultdict): + A dictionary where each feature key maps to a dictionary containing: + + mean (float): The current estimate of the feature's weight. + variance (float): The uncertainty associated with the weight estimate. + + bias_weight (float): + The weight corresponding to the model bias, initialized using the prior_probability. + This attribute allows the model to make predictions even when no features are active. + + Examples: + ---------- + >>> adpredictor = AdPredictor(beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=5) + >>> data = [ + ({"feature1": 1, "feature2": 1}, 1), + ({"feature1": 1, "feature3": 1}, 0), + ({"feature2": 1, "feature4": 1}, 1), + ({"feature1": 1, "feature2": 1, "feature3": 1}, 0), + ({"feature4": 1, "feature5": 1}, 1), + ] + >>> def train_and_test(model, data): + for x, y in data: + pred_before = model.predict_one(x) + model.learn_one(x, y) + pred_after = model.predict_one(x) + print(f"Features: {x} | True label: {y} | Prediction before training: {pred_before:.4f} | Prediction after training: {pred_after:.4f}") + + >>> train_and_test(adpredictor, data) + + Features: {'feature1': 1, 'feature2': 1} | True label: 1 | Prediction before training: 0.5000 | Prediction after training: 0.7230 + Features: {'feature1': 1, 'feature3': 1} | True label: 0 | Prediction before training: 0.6065 | Prediction after training: 0.3650 + Features: {'feature2': 1, 'feature4': 1} | True label: 1 | Prediction before training: 0.6065 | Prediction after training: 0.7761 + Features: {'feature1': 1, 'feature2': 1, 'feature3': 1} | True label: 0 | Prediction before training: 0.5455 | Prediction after training: 0.3197 + Features: {'feature4': 1, 'feature5': 1} | True label: 1 | Prediction before training: 0.5888 | Prediction after training: 0.7699 + + """ + + config = namedtuple("config", ["beta", "prior_probability", "epsilon", "num_features"]) + + def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10): + # Initialization of model parameters + self.beta = beta + self.prior_probability = prior_probability + self.epsilon = epsilon + self.num_features = num_features + # Initialize weights as a defaultdict for each feature, with mean and variance attributes + self.weights = defaultdict(lambda: {"mean": 0.0, "variance": 1.0}) + # Initialize bias weight based on prior probability + self.bias_weight = self.prior_bias_weight() + + def prior_bias_weight(self): + # Calculate initial bias weight using prior probability + + return np.log(self.prior_probability / (1 - self.prior_probability)) / self.beta + + def _active_mean_variance(self, features): + """_active_mean_variance(features) (method): + Computes the cumulative mean and variance for all active features in a sample, + including the bias. This is crucial for making predictions.""" + # Calculate total mean and variance for all active features + + total_mean = sum(self.weights[f]["mean"] for f in features) + self.bias_weight + total_variance = sum(self.weights[f]["variance"] for f in features) + self.beta**2 + return total_mean, total_variance + + def predict_one(self, x): + # Generate a probability prediction for one sample + features = x.keys() + total_mean, total_variance = self._active_mean_variance(features) + # Sigmoid function for probability prediction based on Gaussian distribution + return 1 / (1 + np.exp(-total_mean / np.sqrt(total_variance))) + + def learn_one(self, x, y): + # Online learning step to update the model with one sample + features = x.keys() + y = 1 if y else -1 + total_mean, total_variance = self._active_mean_variance(features) + v, w = self.gaussian_corrections(y * total_mean / np.sqrt(total_variance)) + + # Update mean and variance for each feature in the sample + for feature in features: + mean = self.weights[feature]["mean"] + variance = self.weights[feature]["variance"] + + mean_delta = y * variance / np.sqrt(total_variance) * v # Update mean + variance_multiplier = 1.0 - variance / total_variance * w # Update variance + + # Update weight + self.weights[feature]["mean"] = mean + mean_delta + self.weights[feature]["variance"] = variance * variance_multiplier + + def gaussian_corrections(self, score): + """gaussian_corrections(score) (method): + Implements Bayesian update corrections using the Gaussian probability density function (PDF) + and cumulative density function (CDF).""" + # CDF calculation for Gaussian correction + cdf = 1 / (1 + np.exp(-score)) + pdf = np.exp(-0.5 * score**2) / np.sqrt(2 * np.pi) # PDF calculation + v = pdf / cdf # Correction factor for mean update + w = v * (v + score) # Correction factor for variance update + return v, w + + def _apply_dynamics(self, weight): + """_apply_dynamics(weight) (method): + Regularizes the variance of a feature weight using a combination of prior variance and learned variance. + This helps maintain a balance between prior beliefs and observed data.""" + # Apply variance dynamics for regularization + prior_variance = 1.0 + # Adjust variance to manage prior knowledge and current learning balance + adjusted_variance = ( + weight["variance"] + * prior_variance + / ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight["variance"]) + ) + # Adjust mean based on the dynamics, balancing previous and current knowledge + adjusted_mean = adjusted_variance * ( + (1.0 - self.epsilon) * weight["mean"] / weight["variance"] + + self.epsilon * 0 / prior_variance + ) + return {"mean": adjusted_mean, "variance": adjusted_variance} + + def __str__(self): + # String representation of the model for easy identification + return "AdPredictor" From e0e6c7570edafb22c5f9041023e2c3c6d7d56ec2 Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Sun, 17 Nov 2024 00:33:38 +0100 Subject: [PATCH 06/16] add adpredictor --- river/linear_model/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/river/linear_model/__init__.py b/river/linear_model/__init__.py index e74d0d1439..eb31add391 100644 --- a/river/linear_model/__init__.py +++ b/river/linear_model/__init__.py @@ -11,6 +11,7 @@ from .perceptron import Perceptron from .softmax import SoftmaxRegression from .rls import RLS +from .adpredictor import AdPredictor __all__ = [ "base", @@ -23,4 +24,5 @@ "Perceptron", "SoftmaxRegression", "RLS", + "AdPredictor", ] From ff8c6174546eb9491b4040ea9b548a063aa05084 Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Sun, 17 Nov 2024 00:46:25 +0100 Subject: [PATCH 07/16] added an adpredictor function --- river/base/Adpredictor.py | 76 -------- river/base/Adpredictor_test.ipynb | 188 -------------------- river/base/__init__.py | 2 + river/{linear_model => base}/adpredictor.py | 0 4 files changed, 2 insertions(+), 264 deletions(-) delete mode 100644 river/base/Adpredictor.py delete mode 100644 river/base/Adpredictor_test.ipynb rename river/{linear_model => base}/adpredictor.py (100%) diff --git a/river/base/Adpredictor.py b/river/base/Adpredictor.py deleted file mode 100644 index 4702760cf8..0000000000 --- a/river/base/Adpredictor.py +++ /dev/null @@ -1,76 +0,0 @@ -from __future__ import annotations - -import logging -from collections import defaultdict, namedtuple - -import numpy as np - -from river import base - -logger = logging.getLogger(__name__) - - -class AdPredictor(base.Classifier): - config = namedtuple("config", ["beta", "prior_probability", "epsilon", "num_features"]) - - def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10): - self.beta = beta - self.prior_probability = prior_probability - self.epsilon = epsilon - self.num_features = num_features - self.weights = defaultdict(lambda: {"mean": 0.0, "variance": 1.0}) - self.bias_weight = self.prior_bias_weight() - - def prior_bias_weight(self): - return np.log(self.prior_probability / (1 - self.prior_probability)) / self.beta - - def _active_mean_variance(self, features): - total_mean = sum(self.weights[f]["mean"] for f in features) + self.bias_weight - total_variance = sum(self.weights[f]["variance"] for f in features) + self.beta**2 - return total_mean, total_variance - - def predict_one(self, x): - features = x.keys() - total_mean, total_variance = self._active_mean_variance(features) - return 1 / (1 + np.exp(-total_mean / np.sqrt(total_variance))) - - def learn_one(self, x, y): - features = x.keys() - y = 1 if y else -1 # Map label to ±1 for binary classification - total_mean, total_variance = self._active_mean_variance(features) - v, w = self.gaussian_corrections(y * total_mean / np.sqrt(total_variance)) - - for feature in features: - mean = self.weights[feature]["mean"] - variance = self.weights[feature]["variance"] - - mean_delta = y * variance / np.sqrt(total_variance) * v - variance_multiplier = 1.0 - variance / total_variance * w - - # Update weight - self.weights[feature]["mean"] = mean + mean_delta - self.weights[feature]["variance"] = variance * variance_multiplier - - def gaussian_corrections(self, score): - """Compute Gaussian corrections for Bayesian update.""" - cdf = 1 / (1 + np.exp(-score)) - pdf = np.exp(-0.5 * score**2) / np.sqrt(2 * np.pi) - v = pdf / cdf - w = v * (v + score) - return v, w - - def _apply_dynamics(self, weight): - prior_variance = 1.0 - adjusted_variance = ( - weight["variance"] - * prior_variance - / ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight["variance"]) - ) - adjusted_mean = adjusted_variance * ( - (1.0 - self.epsilon) * weight["mean"] / weight["variance"] - + self.epsilon * 0 / prior_variance - ) - return {"mean": adjusted_mean, "variance": adjusted_variance} - - def __str__(self): - return "AdPredictor" diff --git a/river/base/Adpredictor_test.ipynb b/river/base/Adpredictor_test.ipynb deleted file mode 100644 index 5a6b4b4ab9..0000000000 --- a/river/base/Adpredictor_test.ipynb +++ /dev/null @@ -1,188 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 9, - "id": "initial_id", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:34.707796Z", - "start_time": "2024-11-09T15:04:34.663651Z" - }, - "collapsed": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], - "source": [ - "from __future__ import annotations\n", - "\n", - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "65e3111b76277fc5", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:37.139365Z", - "start_time": "2024-11-09T15:04:37.097733Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "C:\\River\\riverIDLIB\\river\\base\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "print(os.getcwd())" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "3ffeadeef731f48e", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:38.751355Z", - "start_time": "2024-11-09T15:04:38.708257Z" - } - }, - "outputs": [], - "source": [ - "from river import compose, datasets, metrics, preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "e323aa048e864b33", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:45.986384Z", - "start_time": "2024-11-09T15:04:45.943819Z" - } - }, - "outputs": [], - "source": [ - "import importlib.util\n", - "\n", - "spec = importlib.util.spec_from_file_location(\"AdPredictor\", \"./AdPredictor.py\")\n", - "AdPredictor_module = importlib.util.module_from_spec(spec)\n", - "spec.loader.exec_module(AdPredictor_module)\n", - "\n", - "AdPredictor = AdPredictor_module.AdPredictor" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "c862e33d656cb230", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:49.810249Z", - "start_time": "2024-11-09T15:04:49.768766Z" - } - }, - "outputs": [], - "source": [ - "phishing_data = datasets.Phishing()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "293c681cca67e1f4", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:54.402788Z", - "start_time": "2024-11-09T15:04:54.358522Z" - } - }, - "outputs": [], - "source": [ - "model = AdPredictor(beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=9)\n", - "metric = metrics.Accuracy()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "c8fb1bb8ed902d80", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:59.609037Z", - "start_time": "2024-11-09T15:04:59.569533Z" - } - }, - "outputs": [], - "source": [ - "model_pipeline = compose.Pipeline((\"scale\", preprocessing.StandardScaler()), (\"predictor\", model))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "1dea7c542ab4ad84", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:03:47.107046Z", - "start_time": "2024-11-09T15:03:47.054657Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction: 0.5, Metric: None\n" - ] - } - ], - "source": [ - "for x, y in phishing_data:\n", - " y_pred = model_pipeline.predict_one(x)\n", - " metric = metric.update(y, y_pred)\n", - " model_pipeline = model_pipeline.learn_one(x, y)\n", - " print(f\"Prediction: {y_pred}, Metric: {metric}\")\n", - " break" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/river/base/__init__.py b/river/base/__init__.py index 0aaa521934..2d6f2c2c97 100644 --- a/river/base/__init__.py +++ b/river/base/__init__.py @@ -35,8 +35,10 @@ Transformer, ) from .wrapper import Wrapper +from .adpredictor import AdPredictor __all__ = [ + "AdPredictor", "Base", "BinaryDriftDetector", "BinaryDriftAndWarningDetector", diff --git a/river/linear_model/adpredictor.py b/river/base/adpredictor.py similarity index 100% rename from river/linear_model/adpredictor.py rename to river/base/adpredictor.py From 67e7e14de20ee307e615c68cd7560d76fe669b70 Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Sun, 17 Nov 2024 00:47:24 +0100 Subject: [PATCH 08/16] remooved adpredictor here --- river/linear_model/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/river/linear_model/__init__.py b/river/linear_model/__init__.py index eb31add391..4ecd736125 100644 --- a/river/linear_model/__init__.py +++ b/river/linear_model/__init__.py @@ -11,7 +11,7 @@ from .perceptron import Perceptron from .softmax import SoftmaxRegression from .rls import RLS -from .adpredictor import AdPredictor + __all__ = [ "base", @@ -24,5 +24,5 @@ "Perceptron", "SoftmaxRegression", "RLS", - "AdPredictor", + ] From 6f43ec8a5ddc490054867429b2e8684cca652015 Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Sun, 17 Nov 2024 01:11:33 +0100 Subject: [PATCH 09/16] fixed bugs --- river/base/adpredictor.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/river/base/adpredictor.py b/river/base/adpredictor.py index 6af80ef810..3fd35c3f1d 100644 --- a/river/base/adpredictor.py +++ b/river/base/adpredictor.py @@ -7,6 +7,9 @@ from river.base import Classifier +def default_weight(): + return {"mean": 0.0, "variance": 1.0} + class AdPredictor(Classifier): """AdPredictor, developed by Microsoft, is a machine learning algorithm designed to predict the probability of user clicks on online advertisements. This algorithm plays a crucial role in computational advertising, where predicting @@ -42,22 +45,16 @@ class AdPredictor(Classifier): Examples: ---------- - >>> adpredictor = AdPredictor(beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=5) - >>> data = [ - ({"feature1": 1, "feature2": 1}, 1), - ({"feature1": 1, "feature3": 1}, 0), - ({"feature2": 1, "feature4": 1}, 1), - ({"feature1": 1, "feature2": 1, "feature3": 1}, 0), - ({"feature4": 1, "feature5": 1}, 1), - ] - >>> def train_and_test(model, data): + adpredictor = AdPredictor(beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=5) + data = [({"feature1": 1, "feature2": 1}, 1),({"feature1": 1, "feature3": 1}, 0),({"feature2": 1, "feature4": 1}, 1),({"feature1": 1, "feature2": 1, "feature3": 1}, 0),({"feature4": 1, "feature5": 1}, 1),] + def train_and_test(model, data): for x, y in data: - pred_before = model.predict_one(x) + pred_before = model.predict_one(x) model.learn_one(x, y) pred_after = model.predict_one(x) print(f"Features: {x} | True label: {y} | Prediction before training: {pred_before:.4f} | Prediction after training: {pred_after:.4f}") - >>> train_and_test(adpredictor, data) + train_and_test(adpredictor, data) Features: {'feature1': 1, 'feature2': 1} | True label: 1 | Prediction before training: 0.5000 | Prediction after training: 0.7230 Features: {'feature1': 1, 'feature3': 1} | True label: 0 | Prediction before training: 0.6065 | Prediction after training: 0.3650 @@ -69,6 +66,7 @@ class AdPredictor(Classifier): config = namedtuple("config", ["beta", "prior_probability", "epsilon", "num_features"]) + def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10): # Initialization of model parameters self.beta = beta @@ -76,8 +74,11 @@ def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10 self.epsilon = epsilon self.num_features = num_features # Initialize weights as a defaultdict for each feature, with mean and variance attributes - self.weights = defaultdict(lambda: {"mean": 0.0, "variance": 1.0}) + #self.weights = defaultdict(lambda: {"mean": 0.0, "variance": 1.0}) # Initialize bias weight based on prior probability + + + self.weights = defaultdict(default_weight) self.bias_weight = self.prior_bias_weight() def prior_bias_weight(self): From 89cd67e5703d980738fa08b0470a45af0e4d1223 Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Sun, 17 Nov 2024 01:13:02 +0100 Subject: [PATCH 10/16] fixed bugs --- river/base/adpredictor.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/river/base/adpredictor.py b/river/base/adpredictor.py index 3fd35c3f1d..bcd5deb49f 100644 --- a/river/base/adpredictor.py +++ b/river/base/adpredictor.py @@ -74,11 +74,8 @@ def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10 self.epsilon = epsilon self.num_features = num_features # Initialize weights as a defaultdict for each feature, with mean and variance attributes - #self.weights = defaultdict(lambda: {"mean": 0.0, "variance": 1.0}) - # Initialize bias weight based on prior probability - - self.weights = defaultdict(default_weight) + # Initialize bias weight based on prior probability self.bias_weight = self.prior_bias_weight() def prior_bias_weight(self): From 54a94e3ea1d0e549403a1f207f4e68661d528378 Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Sun, 17 Nov 2024 01:20:35 +0100 Subject: [PATCH 11/16] removed rls --- river/linear_model/__init__.py | 4 +-- river/linear_model/rls.py | 47 ---------------------------------- 2 files changed, 2 insertions(+), 49 deletions(-) delete mode 100644 river/linear_model/rls.py diff --git a/river/linear_model/__init__.py b/river/linear_model/__init__.py index 4ecd736125..33d6cfa05e 100644 --- a/river/linear_model/__init__.py +++ b/river/linear_model/__init__.py @@ -10,7 +10,7 @@ from .pa import PAClassifier, PARegressor from .perceptron import Perceptron from .softmax import SoftmaxRegression -from .rls import RLS + __all__ = [ @@ -23,6 +23,6 @@ "PARegressor", "Perceptron", "SoftmaxRegression", - "RLS", + ] diff --git a/river/linear_model/rls.py b/river/linear_model/rls.py deleted file mode 100644 index 9aa69410e1..0000000000 --- a/river/linear_model/rls.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np - - -class RLS(object): - - def __init__(self, p: int, l: float, delta: float): - self.p = p # Filter order - self.l = l # Forgetting factor - self.delta = delta # Value to initialise P(0) - - self.currentStep = 0 - - self.x = np.zeros((p + 1, 1)) # Column vector - self.P = np.identity(p + 1) * self.delta - - self.estimates = [] - self.estimates.append(np.zeros((p + 1, 1))) # Weight vector initialized to zeros - - self.Pks = [] - self.Pks.append(self.P) - - def estimate(self, xn: float, dn: float): - # Update input vector - self.x = np.roll(self.x, -1) - self.x[-1, 0] = xn - - # Get previous weight vector - wn_prev = self.estimates[-1] - - # Compute gain vector - denominator = self.l + self.x.T @ self.Pks[-1] @ self.x - gn = (self.Pks[-1] @ self.x) / denominator - - # Compute a priori error - alpha = dn - (self.x.T @ wn_prev) - - # Update inverse correlation matrix - Pn = (self.Pks[-1] - gn @ self.x.T @ self.Pks[-1]) / self.l - self.Pks.append(Pn) - - # Update weight vector - wn = wn_prev + gn * alpha - self.estimates.append(wn) - - self.currentStep += 1 - - return wn From 4a7bc49c2b95b4021c3854283dadc2f19c58336e Mon Sep 17 00:00:00 2001 From: s23lachg Date: Sun, 17 Nov 2024 01:27:16 +0100 Subject: [PATCH 12/16] Fix test pre commit --- river/base/__init__.py | 2 +- river/base/adpredictor.py | 2 +- river/linear_model/__init__.py | 4 ---- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/river/base/__init__.py b/river/base/__init__.py index 2d6f2c2c97..a92e437b32 100644 --- a/river/base/__init__.py +++ b/river/base/__init__.py @@ -15,6 +15,7 @@ from __future__ import annotations from . import tags, typing +from .adpredictor import AdPredictor from .base import Base from .classifier import Classifier, MiniBatchClassifier from .clusterer import Clusterer @@ -35,7 +36,6 @@ Transformer, ) from .wrapper import Wrapper -from .adpredictor import AdPredictor __all__ = [ "AdPredictor", diff --git a/river/base/adpredictor.py b/river/base/adpredictor.py index bcd5deb49f..e95b581ffd 100644 --- a/river/base/adpredictor.py +++ b/river/base/adpredictor.py @@ -10,6 +10,7 @@ def default_weight(): return {"mean": 0.0, "variance": 1.0} + class AdPredictor(Classifier): """AdPredictor, developed by Microsoft, is a machine learning algorithm designed to predict the probability of user clicks on online advertisements. This algorithm plays a crucial role in computational advertising, where predicting @@ -66,7 +67,6 @@ def train_and_test(model, data): config = namedtuple("config", ["beta", "prior_probability", "epsilon", "num_features"]) - def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10): # Initialization of model parameters self.beta = beta diff --git a/river/linear_model/__init__.py b/river/linear_model/__init__.py index 33d6cfa05e..756720490a 100644 --- a/river/linear_model/__init__.py +++ b/river/linear_model/__init__.py @@ -11,8 +11,6 @@ from .perceptron import Perceptron from .softmax import SoftmaxRegression - - __all__ = [ "base", "ALMAClassifier", @@ -23,6 +21,4 @@ "PARegressor", "Perceptron", "SoftmaxRegression", - - ] From 7311788a181597751cf2880d7346b74c6cfebef0 Mon Sep 17 00:00:00 2001 From: s23lachg Date: Sun, 17 Nov 2024 01:48:05 +0100 Subject: [PATCH 13/16] Fixed imports --- river/base/adpredictor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/river/base/adpredictor.py b/river/base/adpredictor.py index e95b581ffd..75c0b2aaf2 100644 --- a/river/base/adpredictor.py +++ b/river/base/adpredictor.py @@ -4,7 +4,7 @@ import numpy as np -from river.base import Classifier +from river.base.classifier import Classifier def default_weight(): From 648b1a4555b74edb946bd06f0ad5414d1b7da2e6 Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Tue, 26 Nov 2024 19:47:05 +0100 Subject: [PATCH 14/16] adjusted the adpredictor algorithm --- river/{base => linear_model}/adpredictor.py | 87 +++++++++++---------- 1 file changed, 46 insertions(+), 41 deletions(-) rename river/{base => linear_model}/adpredictor.py (68%) diff --git a/river/base/adpredictor.py b/river/linear_model/adpredictor.py similarity index 68% rename from river/base/adpredictor.py rename to river/linear_model/adpredictor.py index e95b581ffd..8755b48504 100644 --- a/river/base/adpredictor.py +++ b/river/linear_model/adpredictor.py @@ -1,18 +1,20 @@ -from __future__ import annotations +import math -from collections import defaultdict, namedtuple - -import numpy as np +from collections import * from river.base import Classifier -def default_weight(): - return {"mean": 0.0, "variance": 1.0} +def default_mean(): + return 0.0 + +def default_variance(): + return 1.0 class AdPredictor(Classifier): - """AdPredictor, developed by Microsoft, is a machine learning algorithm designed to predict the probability of user + """ + AdPredictor is a machine learning algorithm designed to predict the probability of user clicks on online advertisements. This algorithm plays a crucial role in computational advertising, where predicting click-through rates (CTR) is essential for optimizing ad placements and maximizing revenue. Parameters @@ -46,17 +48,18 @@ class AdPredictor(Classifier): Examples: ---------- - adpredictor = AdPredictor(beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=5) - data = [({"feature1": 1, "feature2": 1}, 1),({"feature1": 1, "feature3": 1}, 0),({"feature2": 1, "feature4": 1}, 1),({"feature1": 1, "feature2": 1, "feature3": 1}, 0),({"feature4": 1, "feature5": 1}, 1),] - def train_and_test(model, data): - for x, y in data: - pred_before = model.predict_one(x) - model.learn_one(x, y) - pred_after = model.predict_one(x) - print(f"Features: {x} | True label: {y} | Prediction before training: {pred_before:.4f} | Prediction after training: {pred_after:.4f}") - - train_and_test(adpredictor, data) + >>> from river.linear_model import AdPredictor + >>> adpredictor = AdPredictor(beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=5) + >>> data = [({"feature1": 1, "feature2": 1}, 1),({"feature1": 1, "feature3": 1}, 0),({"feature2": 1, "feature4": 1}, 1),({"feature1": 1, "feature2": 1, "feature3": 1}, 0),({"feature4": 1, "feature5": 1}, 1),] + >>> def train_and_test(model, data): + ... for x, y in data: + ... pred_before = model.predict_one(x) + ... model.learn_one(x, y) + ... pred_after = model.predict_one(x) + ... print(f"Features: {x} | True label: {y} | Prediction before training: {pred_before:.4f} | Prediction after training: {pred_after:.4f}") + + >>> train_and_test(adpredictor, data) Features: {'feature1': 1, 'feature2': 1} | True label: 1 | Prediction before training: 0.5000 | Prediction after training: 0.7230 Features: {'feature1': 1, 'feature3': 1} | True label: 0 | Prediction before training: 0.6065 | Prediction after training: 0.3650 Features: {'feature2': 1, 'feature4': 1} | True label: 1 | Prediction before training: 0.6065 | Prediction after training: 0.7761 @@ -65,7 +68,7 @@ def train_and_test(model, data): """ - config = namedtuple("config", ["beta", "prior_probability", "epsilon", "num_features"]) + def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10): # Initialization of model parameters @@ -74,14 +77,18 @@ def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10 self.epsilon = epsilon self.num_features = num_features # Initialize weights as a defaultdict for each feature, with mean and variance attributes - self.weights = defaultdict(default_weight) + + self.means = defaultdict(default_mean) + self.variances = defaultdict(default_variance) + + # Initialize bias weight based on prior probability self.bias_weight = self.prior_bias_weight() def prior_bias_weight(self): # Calculate initial bias weight using prior probability - return np.log(self.prior_probability / (1 - self.prior_probability)) / self.beta + return math.log(self.prior_probability / (1 - self.prior_probability)) / self.beta def _active_mean_variance(self, features): """_active_mean_variance(features) (method): @@ -89,8 +96,8 @@ def _active_mean_variance(self, features): including the bias. This is crucial for making predictions.""" # Calculate total mean and variance for all active features - total_mean = sum(self.weights[f]["mean"] for f in features) + self.bias_weight - total_variance = sum(self.weights[f]["variance"] for f in features) + self.beta**2 + total_mean = sum(self.means[f] for f in features) + self.bias_weight + total_variance = sum(self.variances[f] for f in features) + self.beta ** 2 return total_mean, total_variance def predict_one(self, x): @@ -98,34 +105,35 @@ def predict_one(self, x): features = x.keys() total_mean, total_variance = self._active_mean_variance(features) # Sigmoid function for probability prediction based on Gaussian distribution - return 1 / (1 + np.exp(-total_mean / np.sqrt(total_variance))) + return 1 / (1 + math.exp(-total_mean / math.sqrt(total_variance))) def learn_one(self, x, y): # Online learning step to update the model with one sample features = x.keys() y = 1 if y else -1 total_mean, total_variance = self._active_mean_variance(features) - v, w = self.gaussian_corrections(y * total_mean / np.sqrt(total_variance)) + v, w = self.gaussian_corrections(y * total_mean / math.sqrt(total_variance)) # Update mean and variance for each feature in the sample for feature in features: - mean = self.weights[feature]["mean"] - variance = self.weights[feature]["variance"] + mean = self.means[feature] + variance = self.variances[feature] - mean_delta = y * variance / np.sqrt(total_variance) * v # Update mean - variance_multiplier = 1.0 - variance / total_variance * w # Update variance + mean_delta = y * variance / math.sqrt(total_variance) * v # Update mean + variance_multiplier = 1.0 - variance / total_variance * w # Update variance # Update weight - self.weights[feature]["mean"] = mean + mean_delta - self.weights[feature]["variance"] = variance * variance_multiplier + self.means[feature] = mean + mean_delta + self.variances[feature]= variance * variance_multiplier + def gaussian_corrections(self, score): """gaussian_corrections(score) (method): Implements Bayesian update corrections using the Gaussian probability density function (PDF) and cumulative density function (CDF).""" # CDF calculation for Gaussian correction - cdf = 1 / (1 + np.exp(-score)) - pdf = np.exp(-0.5 * score**2) / np.sqrt(2 * np.pi) # PDF calculation + cdf = 1 / (1 + math.exp(-score)) + pdf = math.exp(-0.5 * score**2) / math.sqrt(2 * math.pi) # PDF calculation v = pdf / cdf # Correction factor for mean update w = v * (v + score) # Correction factor for variance update return v, w @@ -138,17 +146,14 @@ def _apply_dynamics(self, weight): prior_variance = 1.0 # Adjust variance to manage prior knowledge and current learning balance adjusted_variance = ( - weight["variance"] - * prior_variance - / ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight["variance"]) - ) + weight["variance"] + * prior_variance + / ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight["variance"])) # Adjust mean based on the dynamics, balancing previous and current knowledge adjusted_mean = adjusted_variance * ( - (1.0 - self.epsilon) * weight["mean"] / weight["variance"] - + self.epsilon * 0 / prior_variance + (1.0 - self.epsilon) * weight["mean"] / weight["variance"] + + self.epsilon * 0 / prior_variance ) return {"mean": adjusted_mean, "variance": adjusted_variance} - def __str__(self): - # String representation of the model for easy identification - return "AdPredictor" + From dcb0f98e9eabe4d022e0c96684496d247cf2339c Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Tue, 26 Nov 2024 19:49:45 +0100 Subject: [PATCH 15/16] updated the rest of the project --- river/base/__init__.py | 2 -- river/linear_model/__init__.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/river/base/__init__.py b/river/base/__init__.py index a92e437b32..0aaa521934 100644 --- a/river/base/__init__.py +++ b/river/base/__init__.py @@ -15,7 +15,6 @@ from __future__ import annotations from . import tags, typing -from .adpredictor import AdPredictor from .base import Base from .classifier import Classifier, MiniBatchClassifier from .clusterer import Clusterer @@ -38,7 +37,6 @@ from .wrapper import Wrapper __all__ = [ - "AdPredictor", "Base", "BinaryDriftDetector", "BinaryDriftAndWarningDetector", diff --git a/river/linear_model/__init__.py b/river/linear_model/__init__.py index 756720490a..9eaedb1dda 100644 --- a/river/linear_model/__init__.py +++ b/river/linear_model/__init__.py @@ -3,6 +3,7 @@ from __future__ import annotations from . import base +from .adpredictor import AdPredictor from .alma import ALMAClassifier from .bayesian_lin_reg import BayesianLinearRegression from .lin_reg import LinearRegression @@ -21,4 +22,5 @@ "PARegressor", "Perceptron", "SoftmaxRegression", + "AdPredictor", ] From 1b82c24ebd749be55280f3f72562f38bfa2ece82 Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Tue, 26 Nov 2024 20:09:09 +0100 Subject: [PATCH 16/16] modified defaultdict --- river/linear_model/adpredictor.py | 35 ++++++++++++++----------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/river/linear_model/adpredictor.py b/river/linear_model/adpredictor.py index 8628a8a025..67811a9529 100644 --- a/river/linear_model/adpredictor.py +++ b/river/linear_model/adpredictor.py @@ -1,6 +1,7 @@ -import math +from __future__ import annotations -from collections import * +import collections +import math from river.base.classifier import Classifier @@ -8,6 +9,7 @@ def default_mean(): return 0.0 + def default_variance(): return 1.0 @@ -68,8 +70,6 @@ class AdPredictor(Classifier): """ - - def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10): # Initialization of model parameters self.beta = beta @@ -78,9 +78,8 @@ def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10 self.num_features = num_features # Initialize weights as a defaultdict for each feature, with mean and variance attributes - self.means = defaultdict(default_mean) - self.variances = defaultdict(default_variance) - + self.means = collections.defaultdict(default_mean) + self.variances = collections.defaultdict(default_variance) # Initialize bias weight based on prior probability self.bias_weight = self.prior_bias_weight() @@ -97,7 +96,7 @@ def _active_mean_variance(self, features): # Calculate total mean and variance for all active features total_mean = sum(self.means[f] for f in features) + self.bias_weight - total_variance = sum(self.variances[f] for f in features) + self.beta ** 2 + total_variance = sum(self.variances[f] for f in features) + self.beta**2 return total_mean, total_variance def predict_one(self, x): @@ -119,13 +118,12 @@ def learn_one(self, x, y): mean = self.means[feature] variance = self.variances[feature] - mean_delta = y * variance / math.sqrt(total_variance) * v # Update mean - variance_multiplier = 1.0 - variance / total_variance * w # Update variance + mean_delta = y * variance / math.sqrt(total_variance) * v # Update mean + variance_multiplier = 1.0 - variance / total_variance * w # Update variance # Update weight self.means[feature] = mean + mean_delta - self.variances[feature]= variance * variance_multiplier - + self.variances[feature] = variance * variance_multiplier def gaussian_corrections(self, score): """gaussian_corrections(score) (method): @@ -146,14 +144,13 @@ def _apply_dynamics(self, weight): prior_variance = 1.0 # Adjust variance to manage prior knowledge and current learning balance adjusted_variance = ( - weight["variance"] - * prior_variance - / ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight["variance"])) + weight["variance"] + * prior_variance + / ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight["variance"]) + ) # Adjust mean based on the dynamics, balancing previous and current knowledge adjusted_mean = adjusted_variance * ( - (1.0 - self.epsilon) * weight["mean"] / weight["variance"] - + self.epsilon * 0 / prior_variance + (1.0 - self.epsilon) * weight["mean"] / weight["variance"] + + self.epsilon * 0 / prior_variance ) return {"mean": adjusted_mean, "variance": adjusted_variance} - -