data_hacks
data_hacks copied to clipboard
histogram.py errors if all values are equal
histogram.py raises a ValueError ("max must be > min") if all input values are equal, e.g.:
vagrant@localhost:~/ws$ echo -e '16\n16\n16\n' | ~/ws/bitly/data_hacks/histogram.py
Traceback (most recent call last):
File "/home/vagrant/ws/bitly/data_hacks/histogram.py", line 300, in <module>
options.agg_key_value), options)
File "/home/vagrant/ws/bitly/data_hacks/histogram.py", line 150, in histogram
raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
ValueError: max must be > min. max:16 min:16
Hi, I'd like to fix this issue. Please let me know how I can contribute :)
This error is also hit in the corner case where the input contains only a single value.
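The following patch works for me: when max equals min, it prints the warning to stderr and widens the range to a "fake" min/max instead of raising. It is a little wonky in terms of Decimal math, though (for a value of zero the widened range collapses to zero width, and for negative values it inverts):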
--- histogram.py 2016-11-04 14:54:19.092902000 -0400
+++ fixed_histogram.py 2016-11-07 17:35:01.707723906 -0500
@@ -139,9 +139,16 @@
max_v = max(data, key=lambda x: x.value)
max_v = max_v.value
- if not max_v > min_v:
- raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
- diff = max_v - min_v
+ fake_max = max_v
+ fake_min = min_v
+
+ if not fake_max > fake_min:
+ print >>sys.stderr, 'max must be > min. max:%s min:%s' % (max_v, min_v)
+ fake_min = max(fake_min - Decimal("0.5"),Decimal("0.5") * fake_min)
+ fake_max = min(fake_max + Decimal("0.5"),Decimal("2.0") * fake_max)
+
+
+ diff = fake_max - fake_min
boundaries = []
bucket_counts = []
@@ -152,18 +159,18 @@
bound_sort = sorted(map(Decimal, bound))
# if the last value is smaller than the maximum, replace it
- if bound_sort[-1] < max_v:
- bound_sort[-1] = max_v
+ if bound_sort[-1] < fake_max:
+ bound_sort[-1] = fake_max
# iterate through the sorted list and append to boundaries
for x in bound_sort:
- if x >= min_v and x <= max_v:
+ if x >= fake_min and x <= fake_max:
boundaries.append(x)
- elif x >= max_v:
- boundaries.append(max_v)
+ elif x >= fake_max:
+ boundaries.append(fake_max)
break
- # beware: the min_v is not included in the boundaries, so no need to do a -1!
+ # beware: the fake_min is not included in the boundaries, so no need to do a -1!
bucket_counts = [0 for x in range(len(boundaries))]
buckets = len(boundaries)
else:
@@ -173,7 +180,7 @@
step = diff / buckets
bucket_counts = [0 for x in range(buckets)]
for x in range(buckets):
- boundaries.append(min_v + (step * (x + 1)))
+ boundaries.append(fake_min + (step * (x + 1)))
skipped = 0
samples = 0
@@ -185,7 +192,7 @@
mvsd.add(record.value, record.count)
accepted_data.append(record)
# find the bucket this goes in
- if record.value < min_v or record.value > max_v:
+ if record.value < fake_min or record.value > fake_max:
skipped += record.count
continue
for bucket_postion, boundary in enumerate(boundaries):
@@ -203,8 +210,8 @@
if options.mvsd:
print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data, key=lambda x: x.value)
)
print "# each ∎ represents a count of %d" % bucket_scale
- bucket_min = min_v
- bucket_max = min_v
+ bucket_min = fake_min
+ bucket_max = fake_min
percentage = ""
format_string = options.format + ' - ' + options.format + ' [%6d]: %s%s'
for bucket in range(buckets):