data_hacks icon indicating copy to clipboard operation
data_hacks copied to clipboard

histogram.py errors if all values are equal

Open cgmb opened this issue 9 years ago • 3 comments

histogram.py raises a ValueError when all input values are equal, e.g.:

vagrant@localhost:~/ws$ echo -e '16\n16\n16\n' | ~/ws/bitly/data_hacks/histogram.py
Traceback (most recent call last):
  File "/home/vagrant/ws/bitly/data_hacks/histogram.py", line 300, in <module>
    options.agg_key_value), options)
  File "/home/vagrant/ws/bitly/data_hacks/histogram.py", line 150, in histogram
    raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
ValueError: max must be > min. max:16 min:16

cgmb avatar Jan 31 '16 00:01 cgmb

Hi, I'd like to fix this issue. Please let me know how I can contribute :)

havanagrawal avatar Apr 20 '16 15:04 havanagrawal

This error is also hit in the corner case where the input contains only a single value.

shabbybanks avatar Nov 07 '16 22:11 shabbybanks

The following patch works for me, but is a little wonky in terms of Decimal math:

--- histogram.py  2016-11-04 14:54:19.092902000 -0400
+++ fixed_histogram.py  2016-11-07 17:35:01.707723906 -0500
@@ -139,9 +139,16 @@
         max_v = max(data, key=lambda x: x.value)
         max_v = max_v.value

-    if not max_v > min_v:
-        raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
-    diff = max_v - min_v
+    fake_max = max_v
+    fake_min = min_v
+
+    if not fake_max > fake_min:
+        print >>sys.stderr, 'max must be > min. max:%s min:%s' % (max_v, min_v)
+        fake_min = max(fake_min - Decimal("0.5"),Decimal("0.5") * fake_min)
+        fake_max = min(fake_max + Decimal("0.5"),Decimal("2.0") * fake_max)
+
+
+    diff = fake_max - fake_min

     boundaries = []
     bucket_counts = []
@@ -152,18 +159,18 @@
         bound_sort = sorted(map(Decimal, bound))

         # if the last value is smaller than the maximum, replace it
-        if bound_sort[-1] < max_v:
-            bound_sort[-1] = max_v
+        if bound_sort[-1] < fake_max:
+            bound_sort[-1] = fake_max

         # iterate through the sorted list and append to boundaries
         for x in bound_sort:
-            if x >= min_v and x <= max_v:
+            if x >= fake_min and x <= fake_max:
                 boundaries.append(x)
-            elif x >= max_v:
-                boundaries.append(max_v)
+            elif x >= fake_max:
+                boundaries.append(fake_max)
                 break

-        # beware: the min_v is not included in the boundaries, so no need to do a -1!
+        # beware: the fake_min is not included in the boundaries, so no need to do a -1!
         bucket_counts = [0 for x in range(len(boundaries))]
         buckets = len(boundaries)
     else:
@@ -173,7 +180,7 @@
         step = diff / buckets
         bucket_counts = [0 for x in range(buckets)]
         for x in range(buckets):
-            boundaries.append(min_v + (step * (x + 1)))
+            boundaries.append(fake_min + (step * (x + 1)))

     skipped = 0
     samples = 0
@@ -185,7 +192,7 @@
             mvsd.add(record.value, record.count)
             accepted_data.append(record)
         # find the bucket this goes in
-        if record.value < min_v or record.value > max_v:
+        if record.value < fake_min or record.value > fake_max:
             skipped += record.count
             continue
         for bucket_postion, boundary in enumerate(boundaries):
@@ -203,8 +210,8 @@
     if options.mvsd:
         print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data, key=lambda x: x.value))
     print "# each ∎ represents a count of %d" % bucket_scale
-    bucket_min = min_v
-    bucket_max = min_v
+    bucket_min = fake_min
+    bucket_max = fake_min
     percentage = ""
     format_string = options.format + ' - ' + options.format + ' [%6d]: %s%s'
     for bucket in range(buckets):

shabbybanks avatar Nov 07 '16 22:11 shabbybanks