Skip to content
Snippets Groups Projects
Commit b4323341 authored by Fanna Lautenbach's avatar Fanna Lautenbach
Browse files

fix n_bins for unique values and change formatting to 3 decimals

parent 97938437
No related branches found
No related tags found
1 merge request: !50 "fix n_bins for unique values and change formatting to 3 decimals"
Pipeline #41507 passed
...@@ -60,7 +60,7 @@ class ComputeInputsHistogram(unittest.TestCase): ...@@ -60,7 +60,7 @@ class ComputeInputsHistogram(unittest.TestCase):
self.assertEqual(53832140800, max_size) self.assertEqual(53832140800, max_size)
self.assertEqual(2, biggest_bucket) self.assertEqual(2, biggest_bucket)
self.assertListEqual([1, 2, 2, 0, 0, 1], counts) self.assertListEqual([1, 2, 2, 0, 0, 1], counts)
self.assertListEqual(['20.3GB', '25.3GB', '30.3GB', '35.2GB', '40.2GB', '45.2GB', '50.1GB'], bins) self.assertListEqual(['20.333GB', '25.300GB', '30.267GB', '35.234GB', '40.201GB', '45.168GB', '50.135GB'], bins)
def test_single_range(self): def test_single_range(self):
test_data = {"surls": [{"size": 1, "surl": "test"}, test_data = {"surls": [{"size": 1, "surl": "test"},
...@@ -77,7 +77,7 @@ class ComputeInputsHistogram(unittest.TestCase): ...@@ -77,7 +77,7 @@ class ComputeInputsHistogram(unittest.TestCase):
self.assertEqual(1, max_size) self.assertEqual(1, max_size)
self.assertEqual(6, biggest_bucket) self.assertEqual(6, biggest_bucket)
self.assertListEqual([6], counts) self.assertListEqual([6], counts)
self.assertListEqual(['0.5B', '1.5B'], bins) self.assertListEqual(['0.500B', '1.500B'], bins)
def test_extreme_wide_range(self): def test_extreme_wide_range(self):
test_data = {"surls": [{"size": 1, "surl": "test"}, test_data = {"surls": [{"size": 1, "surl": "test"},
...@@ -94,4 +94,30 @@ class ComputeInputsHistogram(unittest.TestCase): ...@@ -94,4 +94,30 @@ class ComputeInputsHistogram(unittest.TestCase):
self.assertEqual(1000000000000000000, max_size) self.assertEqual(1000000000000000000, max_size)
self.assertEqual(5, biggest_bucket) self.assertEqual(5, biggest_bucket)
self.assertListEqual([5, 0, 0, 0, 0, 1], counts) # TODO: if this is the case, adapt it to logarithmic scale self.assertListEqual([5, 0, 0, 0, 0, 1], counts) # TODO: if this is the case, adapt it to logarithmic scale
self.assertListEqual(['1.0B', '148.0PB', '296.1PB', '444.1PB', '592.1PB', '740.1PB', '888.2PB'], bins) self.assertListEqual(['1.000B', '148.030PB', '296.059PB', '444.089PB', '592.119PB', '740.149PB', '888.178PB'], bins)
def test_two_ranges(self):
    """Check the histogram inputs for a data set holding only two distinct sizes.

    Eight entries share the smaller size and seven share the larger one, so
    the expected result is two bins with counts of 8 and 7.
    """
    small_size = 97873920
    large_size = 97884160
    # Eight entries of the smaller size followed by seven of the larger one.
    sizes = [small_size] * 8 + [large_size] * 7
    test_data = {"surls": [{"size": size, "surl": "test"} for size in sizes]}

    histogram_inputs = compute_inputs_histogram(test_data)
    min_size, max_size, n_bins, counts, biggest_bucket, bins = histogram_inputs

    self.assertEqual(2, n_bins)
    self.assertEqual(small_size, min_size)
    self.assertEqual(large_size, max_size)
    self.assertEqual(8, biggest_bucket)
    # NOTE(review): this assertion originally carried a "adapt it to
    # logarithmic scale" TODO that looks copied from the wide-range test;
    # confirm whether it is still relevant for two evenly filled bins.
    self.assertListEqual([8, 7], counts)
    # Three bin edges delimit the two bins; labels use three decimals.
    self.assertListEqual(['93.340MB', '93.345MB', '93.350MB'], bins)
...@@ -73,7 +73,8 @@ def compute_inputs_histogram(inputs): ...@@ -73,7 +73,8 @@ def compute_inputs_histogram(inputs):
min_size = inputs_sizes.min() min_size = inputs_sizes.min()
max_size = inputs_sizes.max() max_size = inputs_sizes.max()
n_bins = 1 if min_size == max_size else (inputs_sizes.__len__() if inputs_sizes.__len__() < 100 else 100) n_distinct_sizes = numpy.unique(inputs_sizes).__len__()
n_bins = n_distinct_sizes if n_distinct_sizes < 100 else 100
counts, buckets = numpy.histogram(inputs_sizes, bins=n_bins, range=(min_size, max_size)) counts, buckets = numpy.histogram(inputs_sizes, bins=n_bins, range=(min_size, max_size))
formatted_bins = [format_size(bucket) % bucket for bucket in buckets] formatted_bins = [format_size(bucket) % bucket for bucket in buckets]
...@@ -85,7 +86,7 @@ def format_size(num, suffix="B"): ...@@ -85,7 +86,7 @@ def format_size(num, suffix="B"):
return "-" return "-"
for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]: for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
if abs(num) < 1024.0: if abs(num) < 1024.0:
return f"{num:3.1f}{unit}{suffix}" return f"{num:3.3f}{unit}{suffix}"
num /= 1024.0 num /= 1024.0
return f"{num:.1f}Yi{suffix}" return f"{num:.1f}Yi{suffix}"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment