From f262de1dc329365f08963ea7476a8cbaf53b4b4d Mon Sep 17 00:00:00 2001
From: Farhan Khan <farhan@farhan.codes>
Date: Mon, 1 Feb 2021 20:31:40 +0000
Subject: [PATCH] Migrate to storing data as a JSONB document in the database;
 this captures all data and avoids cache misses

---
 retrieve.go | 39 +++++++++++++++++++++++----------------
 tables.sql  | 33 +++++++++++++--------------------
 2 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/retrieve.go b/retrieve.go
index 37f315f..4c1fa91 100644
--- a/retrieve.go
+++ b/retrieve.go
@@ -6,7 +6,6 @@ import (
 	"errors"
 	"html"
 	"io/ioutil"
-	"io"
 	"net/http"
 	"strings"
 	"time"
@@ -21,7 +20,6 @@ var re *regexp.Regexp
 var matchurl *regexp.Regexp
 
 type ImageType struct {
-	//	Type	string `json:"type"`
 	Url string `json:"url"`
 }
 
@@ -93,8 +91,10 @@ func check_activity(uri string) (PostJson, error) {
 			return activityjson, errors.New("Recently requested within local cache")
 	}
 
-	selectRet := pool.QueryRow(context.Background(), "SELECT id, inReplyTo, published, summary, content, normalized, attributedto, received_at FROM activities WHERE id = $1", uri)
-	err := selectRet.Scan(&activityjson.ID, &activityjson.InReplyTo, &activityjson.Published, &activityjson.Summary, &activityjson.Content, &activityjson.normalized, &activityjson.AttributedTo, &activityjson.receivedAt)
+	var jsondocument string
+
+	selectRet := pool.QueryRow(context.Background(), "SELECT document, normalized FROM activities WHERE document->'id' = $1", uri)
+	err := selectRet.Scan(&activityjson.ID, &jsondocument, &activityjson.normalized)
 	if err == nil {
 		return activityjson, nil
 	}
@@ -114,6 +114,8 @@ func check_activity(uri string) (PostJson, error) {
 	}
 	resp.Body.Close()
 
+	jsondocument = string(body)
+
 	err = json.Unmarshal(body, &activityjson)
 	if err != nil {
 		return activityjson, err
@@ -127,7 +129,6 @@ func check_activity(uri string) (PostJson, error) {
 
 	// If AttributedTo is blank, this is likely an authentication failure
 	// For now, skip it...
-
 	if activityjson.AttributedTo == "" {
 		return activityjson, errors.New("Invalid AttributedTo value on " + uri)
 	}
@@ -142,7 +143,7 @@ func check_activity(uri string) (PostJson, error) {
 	activityjson.normalized = matchurl.ReplaceAllString(activityjson.normalized, "")
 	activityjson.normalized = spaceReg.ReplaceAllString(activityjson.normalized, " ")
 
-	_, err = pool.Exec(context.Background(), "INSERT INTO activities (id, inreplyto, published, summary, content, normalized, attributedto, instance) VALUES($1, $2, $3, $4, $5, $6, $7, $8)", activityjson.ID, activityjson.InReplyTo, activityjson.Published, activityjson.Summary, activityjson.Content, activityjson.normalized, activityjson.AttributedTo, activityjson.instance)
+	_, err = pool.Exec(context.Background(), "INSERT INTO activities (document, normalized, instance) VALUES($1, $2, $3)", jsondocument, activityjson.normalized, activityjson.instance)
 	if err != nil {
 		logDebug.Print(err)
 		return activityjson, err
@@ -169,8 +170,9 @@ func check_actor(uri string) (ActorJson, error) {
 		}
 	}
 
-	selectRet := pool.QueryRow(context.Background(), "SELECT id, actor_type, inbox, outbox, followers, following, url, preferredUsername, name, summary, icon, image, publicKey, instance FROM actors WHERE id = $1", uri)
-	err := selectRet.Scan(&actorjson.ID, &actorjson.Type, &actorjson.Inbox, &actorjson.Outbox, &actorjson.Followers, &actorjson.Following, &actorjson.Url, &actorjson.PreferredUsername, &actorjson.Name, &actorjson.Summary, &actorjson.Icon.Url, &actorjson.Image.Url, &actorjson.PublicKey.PublicKeyPem, &actorjson.instance)
+	var jsondocument string
+	selectRet := pool.QueryRow(context.Background(), "SELECT document, instance FROM actors WHERE document->'id' = $1", uri)
+	err := selectRet.Scan(&actorjson.ID, &jsondocument, &actorjson.instance)
 	if err == nil {
 		return actorjson, nil
 	}
@@ -180,6 +182,7 @@ func check_actor(uri string) (ActorJson, error) {
 	}
 	actorjson.instance = uri[8 : endslash+8]
 
+logDebug.Print("CHECK: " + uri)
 	o, _ := GetRunner(actorjson.instance)
 	req, _ := http.NewRequest("GET", uri, nil)
 	req.Header.Set("User-Agent", "Tusky")
@@ -202,19 +205,23 @@ func check_actor(uri string) (ActorJson, error) {
 		break
 	}
 
-	err = json.NewDecoder(resp.Body).Decode(&actorjson)
+	body, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return actorjson, errors.New("Read error on " + uri)
+	}
+	resp.Body.Close()
+
+	jsondocument = string(body)
+	//logDebug.Print(string(jsondocument))
+
+	err = json.Unmarshal(body, &actorjson)
 	if err != nil {
-		// Going forward, this might need to be double-checked, but for now just die
-		tries = tries + 1
 		return actorjson, err
 	}
 
-	io.Copy(ioutil.Discard, resp.Body)
-
-	resp.Body.Close()
-
-	_, err = pool.Exec(context.Background(), "INSERT INTO actors (id, actor_type, inbox, outbox, followers, following, url, preferredUsername, name, summary, icon, image, publicKey, instance) VALUES($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)", actorjson.ID, actorjson.Type, actorjson.Inbox, actorjson.Outbox, actorjson.Followers, actorjson.Following, actorjson.Url, actorjson.PreferredUsername, actorjson.Name, actorjson.Summary, actorjson.Icon.Url, actorjson.Image.Url, actorjson.PublicKey.PublicKeyPem, actorjson.instance)
+	_, err = pool.Exec(context.Background(), "INSERT INTO actors (document, instance) VALUES($1, $2)", jsondocument, actorjson.instance)
 	if err != nil {
+		logDebug.Print(err)
 		return actorjson, err
 	}
 
diff --git a/tables.sql b/tables.sql
index 20f4a69..00226b8 100644
--- a/tables.sql
+++ b/tables.sql
@@ -3,35 +3,23 @@ DROP TABLE IF EXISTS actors CASCADE;
 DROP TABLE IF EXISTS instances CASCADE;
 
 CREATE TABLE actors (
-    actor_type VARCHAR(1000) NOT NULL,
-    id VARCHAR(2083) NOT NULL PRIMARY KEY UNIQUE,
-    inbox VARCHAR(2083) NOT NULL,
-    outbox VARCHAR(2083) NOT NULL,
-    followers VARCHAR(2083) NOT NULL,
-    following VARCHAR(2083) NOT NULL,
-    url VARCHAR(2083) NOT NULL,
-    preferredusername VARCHAR(1000) NOT NULL,
-    name VARCHAR(1000) NOT NULL,
-    summary TEXT,
-    icon VARCHAR(2083),
-    image VARCHAR(2083),
-    publickey TEXT,
+	id SERIAL PRIMARY KEY,
+	document JSONB,
     identifiedat TIMESTAMP with time zone DEFAULT now(),
     instance VARCHAR(1000) NOT NULL
 );
 
+CREATE UNIQUE INDEX actors_uri ON actors ( (document->'id') );
+
 CREATE TABLE activities (
-    id VARCHAR(2083) NOT NULL PRIMARY KEY UNIQUE,
-    inreplyto VARCHAR(2083),
-    published TIMESTAMP with time zone NOT NULL,
-    summary TEXT,
-    content TEXT,
+	id SERIAL PRIMARY KEY,
+	document JSONB,
     normalized TEXT,
-    attributedto VARCHAR(2083) REFERENCES actors,
-    received_at TIMESTAMP with time zone DEFAULT now(),
     instance VARCHAR(1000) NOT NULL
 );
 
+CREATE UNIQUE INDEX activities_uri ON activities ( (document->'id') );
+
 CREATE TABLE instances (
     endpoint VARCHAR(2083) NOT NULL PRIMARY KEY UNIQUE,
     autostart BOOLEAN,
@@ -44,3 +32,8 @@ CREATE TABLE instances (
 ALTER TABLE activities ADD COLUMN normalized_idx tsvector;
 UPDATE activities SET normalized_idx = to_tsvector('english', normalized);
 CREATE INDEX ON activities USING gin(normalized_idx);
+
+CREATE INDEX actors_id_idx ON actors (id);
+CREATE INDEX activities_id_idx ON activities (id);
+CREATE INDEX actors_uri_idx ON actors ( (document->'id') );
+CREATE INDEX activities_uri_idx ON activities ( (document->'id') );