// Package histogram provides a Go implementation of BigML's histogram package // for Clojure/Java. It is currently experimental. package histogram import ( "container/heap" "math" "sort" ) type Bin struct { Count int Sum float64 } func (b *Bin) Update(x *Bin) { b.Count += x.Count b.Sum += x.Sum } func (b *Bin) Mean() float64 { return b.Sum / float64(b.Count) } type Bins []*Bin func (bs Bins) Len() int { return len(bs) } func (bs Bins) Less(i, j int) bool { return bs[i].Mean() < bs[j].Mean() } func (bs Bins) Swap(i, j int) { bs[i], bs[j] = bs[j], bs[i] } func (bs *Bins) Push(x interface{}) { *bs = append(*bs, x.(*Bin)) } func (bs *Bins) Pop() interface{} { return bs.remove(len(*bs) - 1) } func (bs *Bins) remove(n int) *Bin { if n < 0 || len(*bs) < n { return nil } x := (*bs)[n] *bs = append((*bs)[:n], (*bs)[n+1:]...) return x } type Histogram struct { res *reservoir } func New(maxBins int) *Histogram { return &Histogram{res: newReservoir(maxBins)} } func (h *Histogram) Insert(f float64) { h.res.insert(&Bin{1, f}) h.res.compress() } func (h *Histogram) Bins() Bins { return h.res.bins } type reservoir struct { n int maxBins int bins Bins } func newReservoir(maxBins int) *reservoir { return &reservoir{maxBins: maxBins} } func (r *reservoir) insert(bin *Bin) { r.n += bin.Count i := sort.Search(len(r.bins), func(i int) bool { return r.bins[i].Mean() >= bin.Mean() }) if i < 0 || i == r.bins.Len() { // TODO(blake): Maybe use an .insert(i, bin) instead of // performing the extra work of a heap.Push. heap.Push(&r.bins, bin) return } r.bins[i].Update(bin) } func (r *reservoir) compress() { for r.bins.Len() > r.maxBins { minGapIndex := -1 minGap := math.MaxFloat64 for i := 0; i < r.bins.Len()-1; i++ { gap := gapWeight(r.bins[i], r.bins[i+1]) if minGap > gap { minGap = gap minGapIndex = i } } prev := r.bins[minGapIndex] next := r.bins.remove(minGapIndex + 1) prev.Update(next) } } func gapWeight(prev, next *Bin) float64 { return next.Mean() - prev.Mean() }