amber-ebooks-archived/markov.go

package main

import (
	"context"
	"encoding/gob"
	"log"
	"math/rand"
	"os"
	"path"
	"strconv"
	"strings"
	"sync"
	"unicode"

	mastodon "github.com/mattn/go-mastodon"
)
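
// markovPrefixLength is the number of consecutive normalized words used as
// the key into the forward chain; the same length bounds the suffix keys of
// the backward chain.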
const markovPrefixLength = 2
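
// probableString is a weighted set of candidate words. Count holds how many
// times each word was observed, Order remembers first-seen order so that
// weighted selection iterates deterministically, and Total is the sum of all
// counts.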
type probableString struct {
	Order []string
	Count map[string]int
	Total int
}
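
// insert records one more observation of s, lazily allocating the count map
// and appending previously unseen words to Order. The updated value is
// returned because the method has a value receiver.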
func (ps probableString) insert(s string) probableString {
	if n, ok := ps.Count[s]; ok {
		ps.Count[s] = n + 1
		ps.Total++
		return ps
	}
	if ps.Count == nil {
		ps.Count = make(map[string]int)
	}
	ps.Count[s] = 1
	ps.Order = append(ps.Order, s)
	ps.Total++
	return ps
}
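
// rand picks a word with probability proportional to its count. It panics if
// called on an empty probableString (rand.Intn rejects a zero Total).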
func (ps probableString) rand(r *rand.Rand) string {
	n := r.Intn(ps.Total)
	for _, s := range ps.Order {
		n -= ps.Count[s]
		if n < 0 {
			return s
		}
	}
	panic("unreachable")
}
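
// markovDirty signals that the in-memory chain has changed. The size-1 buffer
// plus the non-blocking send in insertStatus coalesce repeated updates into a
// single pending notification; the consumer is presumably elsewhere in the
// package.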
var markovDirty = make(chan struct{}, 1)
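
// loadData restores the gob-encoded markov state from *flagData, treating a
// missing file as an empty cache. checkError and flagData are defined
// elsewhere in the package.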
func loadData() {
	markovLock.Lock()
	defer markovLock.Unlock()
	f, err := os.Open(*flagData)
	if os.IsNotExist(err) {
		return
	}
	checkError(err, "Could not open data cache")
	defer func() {
		checkError(f.Close(), "Could not close data cache")
	}()
	checkError(gob.NewDecoder(f).Decode(&markov), "Could not read data")
}
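
// saveMarkov writes the current markov state to a staging file and then
// renames it over *flagData, so a crash mid-write cannot corrupt the cache.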
func saveMarkov() {
	markovLock.Lock()
	defer markovLock.Unlock()
	f, err := os.Create(*flagData + ".tmp")
	checkError(err, "Could not create data staging file")
	checkError(gob.NewEncoder(f).Encode(&markov), "Could not save data")
	checkError(f.Close(), "Could not close data staging file")
	checkError(os.Rename(*flagData+".tmp", *flagData), "Could not commit data update")
}
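
// markovLock guards markov and must be held for every read or write of it.
// markov holds per-account bookkeeping plus the two chains: Next maps a
// prefix of normalized words to the raw words that followed it, and Prev maps
// a suffix of normalized words to the raw words that preceded it.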
var markovLock sync.Mutex

var markov = struct {
	Accounts map[mastodon.ID]accountCache
	Next     map[[markovPrefixLength]string]probableString
	Prev     map[[markovPrefixLength]string]probableString
}{
	Accounts: make(map[mastodon.ID]accountCache),
	Next:     make(map[[markovPrefixLength]string]probableString),
	Prev:     make(map[[markovPrefixLength]string]probableString),
}
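
// insertStatus feeds one status into the chain: it runs the raw content
// through cleanContent (defined elsewhere in the package), tracks the highest
// numeric toot ID seen for the account, updates the chain paragraph by
// paragraph, and marks the data dirty.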
func insertStatus(ctx context.Context, account mastodon.ID, id, content string) {
	content = cleanContent(content)
	paragraphs := strings.Split(content, "\n\n")
	markovLock.Lock()
	defer markovLock.Unlock()
	cache := markov.Accounts[account]
	if n, err := strconv.ParseUint(path.Base(id), 10, 64); err == nil && n > cache.LatestRemoteTootID {
		cache.LatestRemoteTootID = n
		markov.Accounts[account] = cache
	}
	for _, p := range paragraphs {
		updateMarkov(strings.Fields(p))
	}
	select {
	case markovDirty <- struct{}{}:
	default:
	}
}
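
// updateMarkov adds one paragraph's words to both chains. Forward: each
// sliding window of markovPrefixLength normalized words maps to the raw word
// that follows it, with "" marking the end of the paragraph. Backward: every
// suffix of length 1..markovPrefixLength starting at word i maps to the raw
// word before it, so genMarkov can walk left from a single seed word.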
func updateMarkov(words []string) {
	const last = markovPrefixLength - 1
	var prefix [markovPrefixLength]string
	for _, word := range words {
		// Record the raw word under the current normalized prefix, then
		// slide the prefix window forward by one word.
		markov.Next[prefix] = markov.Next[prefix].insert(word)
		copy(prefix[:], prefix[1:])
		prefix[last] = normalizeWord(word)
	}
	// An empty string marks the end of the paragraph.
	markov.Next[prefix] = markov.Next[prefix].insert("")
	for i := 1; i < len(words); i++ {
		for l := 1; l <= markovPrefixLength; l++ {
			// Key the backward chain by every suffix of length l starting at
			// word i; the value is the raw word immediately before it.
			var suffix [markovPrefixLength]string
			copy(suffix[:l], words[i:])
			for j := range suffix {
				suffix[j] = normalizeWord(suffix[j])
			}
			markov.Prev[suffix] = markov.Prev[suffix].insert(words[i-1])
		}
	}
	// Suffixes that start at the first word have nothing before them.
	for l := 1; l <= markovPrefixLength; l++ {
		var suffix [markovPrefixLength]string
		copy(suffix[:l], words)
		for j := range suffix {
			suffix[j] = normalizeWord(suffix[j])
		}
		markov.Prev[suffix] = markov.Prev[suffix].insert("")
	}
}
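
// genMarkov produces up to 1000 words. With a seed word it first walks the
// backward chain to build text ending at the seed, reverses it, and primes
// the forward prefix from the tail of that text; it then extends the line
// with the forward chain until an end-of-paragraph marker or the length cap.
// It returns nil if the seed is unknown.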
func genMarkov(r *rand.Rand, seed string) []string {
	const last = markovPrefixLength - 1
	var prefix [markovPrefixLength]string
	var line []string
	markovLock.Lock()
	defer markovLock.Unlock()
	if seed != "" {
		// Walk the backward chain from the seed, collecting words until a
		// start-of-paragraph marker ("") or the length cap.
		var suffix [markovPrefixLength]string
		suffix[0] = normalizeWord(seed)
		ps, ok := markov.Prev[suffix]
		if !ok {
			return nil
		}
		line = append(line, seed)
		for len(line) < 1000 {
			s := ps.rand(r)
			if s == "" {
				break
			}
			line = append(line, s)
			// Shift the suffix right and prepend the newly chosen word.
			copy(suffix[1:], suffix[:])
			suffix[0] = normalizeWord(s)
			ps = markov.Prev[suffix]
		}
		// The words were collected back to front; reverse them in place.
		for i, j := 0, len(line)-1; i < j; i, j = i+1, j-1 {
			line[i], line[j] = line[j], line[i]
		}
		// Prime the forward prefix with the last words of the line so the
		// forward pass continues from where the backward pass stopped.
		for i, j := len(line)-1, last; i >= 0 && j >= 0; i, j = i-1, j-1 {
			prefix[j] = normalizeWord(line[i])
		}
	}
	if _, ok := markov.Next[prefix]; !ok {
		log.Panicln("No markov data available for prefix", prefix, line)
	}
	for len(line) < 1000 {
		ps := markov.Next[prefix]
		s := ps.rand(r)
		if s == "" {
			return line
		}
		line = append(line, s)
		copy(prefix[:], prefix[1:])
		prefix[last] = normalizeWord(s)
	}
	// probably an infinite loop
	return line
}
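
// normalizeWord lowercases a word and strips punctuation so that chain keys
// match regardless of case or surrounding punctuation.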
func normalizeWord(s string) string {
	return strings.Join(strings.FieldsFunc(strings.ToLower(s), unicode.IsPunct), "")
}