Skip to content

Commit

Permalink
dbstream: fix adjacency matrix building (#1340)
Browse files Browse the repository at this point in the history
* dbstream: fix adj matrix building

* Update unreleased.md

* Update docs/releases/unreleased.md

Co-authored-by: Max Halford <[email protected]>

---------

Co-authored-by: Max Halford <[email protected]>
  • Loading branch information
donny741 and MaxHalford authored Oct 15, 2023
1 parent bfb4ea6 commit 424cc38
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 42 deletions.
1 change: 1 addition & 0 deletions docs/releases/unreleased.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ River's mini-batch methods now support pandas v2. In particular, River conforms
- `cluster_is_up_to_date` is set to `True` at the end of the `self._recluster()` function.
- Shared density graph update timestamps are initialized with the current timestamp value
- `neighbour_neighbours` are appended correctly to the `seed_set` when generating cluster labels
- When building weighted adjacency matrix the algorithm accounts for possibly orphaned entries in shared density graph

## datasets

Expand Down
29 changes: 17 additions & 12 deletions river/cluster/dbstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,18 +287,23 @@ def _generate_weighted_adjacency_matrix(self):
weighted_adjacency_matrix = {}
for i in list(self.s.keys()):
for j in list(self.s[i].keys()):
if (
self._micro_clusters[i].weight >= self.minimum_weight
and self._micro_clusters[j].weight >= self.minimum_weight
):
value = self.s[i][j] / (
(self._micro_clusters[i].weight + self._micro_clusters[j].weight) / 2
)
if value > self.intersection_factor:
try:
weighted_adjacency_matrix[i][j] = value
except KeyError:
weighted_adjacency_matrix[i] = {j: value}
try:
if (
self._micro_clusters[i].weight <= self.minimum_weight
or self._micro_clusters[j].weight <= self.minimum_weight
):
continue
except KeyError:
continue

value = self.s[i][j] / (
(self._micro_clusters[i].weight + self._micro_clusters[j].weight) / 2
)
if value > self.intersection_factor:
try:
weighted_adjacency_matrix[i][j] = value
except KeyError:
weighted_adjacency_matrix[i] = {j: value}

return weighted_adjacency_matrix

Expand Down
85 changes: 55 additions & 30 deletions river/cluster/test_dbstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,30 @@
from river.cluster import DBSTREAM


@pytest.fixture
def dbstream():
def build_dbstream(fading_factor=0.001, intersection_factor=0.05):
return DBSTREAM(
fading_factor=0.001, clustering_threshold=1, cleanup_interval=1, intersection_factor=0.05
fading_factor=fading_factor,
clustering_threshold=1,
cleanup_interval=1,
intersection_factor=intersection_factor,
)


def add_cluster(dbstream, initial_point, move_towards, times=1):
dbstream.learn_one(initial_point)
for _ in range(times):
dbstream.learn_one(move_towards)


def assert_micro_cluster_properties(cluster, center, last_update=None):
assert cluster.center == pytest.approx(center)
if last_update is not None:
assert cluster.last_update == last_update


def test_cluster_formation_and_cleanup(dbstream: DBSTREAM):
def test_cluster_formation_and_cleanup():
dbstream = build_dbstream()

X = [
{1: 1},
{1: 3},
Expand All @@ -44,18 +54,12 @@ def test_cluster_formation_and_cleanup(dbstream: DBSTREAM):
assert_micro_cluster_properties(dbstream.micro_clusters[7], center={1: 17}, last_update=12)


def test_with_two_micro_clusters(dbstream: DBSTREAM):
# First micro-cluster
dbstream.learn_one({1: 1, 2: 1})
for _ in range(25):
dbstream.learn_one({1: 1.7, 2: 1.7})

# Second micro-cluster
dbstream.learn_one({1: 3, 2: 3})
for _ in range(25):
dbstream.learn_one({1: 2.3, 2: 2.3})
def test_with_two_micro_clusters():
dbstream = build_dbstream()

# Points in the middle of two micro-clusters
add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25)
add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25)
# Points in the middle of first and second micro-clusters
for _ in range(5):
dbstream.learn_one({1: 2, 2: 2})

Expand All @@ -75,26 +79,16 @@ def test_with_two_micro_clusters(dbstream: DBSTREAM):
assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.003033, 2: 2.003033})


def test_density_graph_with_three_micro_clusters(dbstream: DBSTREAM):
# First micro-cluster
dbstream.learn_one({1: 1, 2: 1})
for _ in range(25):
dbstream.learn_one({1: 1.7, 2: 1.7})

# Second micro-cluster
dbstream.learn_one({1: 3, 2: 3})
for _ in range(25):
dbstream.learn_one({1: 2.3, 2: 2.3})
def test_density_graph_with_three_micro_clusters():
dbstream = build_dbstream()

add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25)
add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25)
# Points in the middle of first and second micro-clusters
for _ in range(5):
dbstream.learn_one({1: 2, 2: 2})

# Third micro-cluster
dbstream.learn_one({1: 4, 2: 4})
for _ in range(25):
dbstream.learn_one({1: 3.3, 2: 3.3})

add_cluster(dbstream, initial_point={1: 4, 2: 4}, move_towards={1: 3.3, 2: 3.3}, times=25)
# Points in the middle of second and third micro-clusters
for _ in range(4):
dbstream.learn_one({1: 3, 2: 3})
Expand All @@ -118,3 +112,34 @@ def test_density_graph_with_three_micro_clusters(dbstream: DBSTREAM):
dbstream._recluster()
assert len(dbstream.clusters) == 1
assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.489894, 2: 2.489894})


def test_density_graph_with_removed_microcluster():
dbstream = build_dbstream(fading_factor=0.1, intersection_factor=0.3)

add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25)
add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25)
# Points in the middle of first and second micro-clusters
for _ in range(5):
dbstream.learn_one({1: 2, 2: 2})

add_cluster(dbstream, initial_point={1: 4, 2: 4}, move_towards={1: 3.3, 2: 3.3}, times=25)
# Points in the middle of second and third micro-clusters
for _ in range(4):
dbstream.learn_one({1: 3, 2: 3})

assert len(dbstream._micro_clusters) == 2
assert_micro_cluster_properties(
dbstream.micro_clusters[1], center={1: 2.461654, 2: 2.461654}, last_update=86
)
assert_micro_cluster_properties(
dbstream.micro_clusters[2], center={1: 3.430485, 2: 3.430485}, last_update=86
)

assert dbstream.s[0] == pytest.approx({1: 3.615835})
assert dbstream.s[1] == pytest.approx({2: 2.803583})
assert dbstream.s_t == {0: {1: 56}, 1: {2: 86}}

dbstream._recluster()
assert len(dbstream.clusters) == 1
assert_micro_cluster_properties(dbstream.clusters[0], center={1: 3.152231, 2: 3.152231})

0 comments on commit 424cc38

Please sign in to comment.