fedilogue/fedilogger/retrieve.go

288 lines
7.5 KiB
Go

package main
import (
"context"
"encoding/json"
"html"
"io/ioutil"
"net/http"
"regexp"
"strings"
"time"
"github.com/microcosm-cc/bluemonday"
)
// Package-level text-processing helpers. They are only declared here; their
// initialisation is not part of this section of the file.
var (
	// p is the HTML sanitizing policy check_activity applies to post content.
	p *bluemonday.Policy

	// spaceReg matches are collapsed to a single space as the last
	// normalization step in check_activity.
	spaceReg *regexp.Regexp

	// removeHTMLReg matches are replaced by a space before the content is
	// sanitized in check_activity.
	removeHTMLReg *regexp.Regexp

	// re is not referenced by the code in this section.
	re *regexp.Regexp

	// matchurl matches are deleted from the normalized text in check_activity.
	matchurl *regexp.Regexp
)
// ImageType is the decoded form of an image object embedded in an actor
// document (see ActorJson.Icon and ActorJson.Image). Only the "url" property
// is kept; every other property of the JSON object is ignored.
type ImageType struct {
	Url string `json:"url"`
}
// PublicKeyType is the decoded form of the "publicKey" object of an actor
// document. Only the "publicKeyPem" property is kept.
type PublicKeyType struct {
	PublicKeyPem string `json:"publicKeyPem"`
}
// ActorJson holds the properties of an actor document that this program
// decodes, plus a few unexported bookkeeping fields.
//
// The exported fields are filled by encoding/json from the JSON property named
// in each tag. The unexported fields are never touched by encoding/json; they
// are set by the code that builds the value (see check_actor).
type ActorJson struct {
	// id is not referenced by the code in this section of the file.
	id int

	// Decoded from the actor document.
	Uri               string        `json:"id"`
	Type              string        `json:"type"`
	Inbox             string        `json:"inbox"`
	Outbox            string        `json:"outbox"`
	Followers         string        `json:"followers"`
	Following         string        `json:"following"`
	Url               string        `json:"url"`
	PreferredUsername string        `json:"preferredUsername"`
	Name              string        `json:"name"`
	Summary           string        `json:"summary"`
	Icon              ImageType     `json:"icon"`
	Image             ImageType     `json:"image"`
	PublicKey         PublicKeyType `json:"publicKey"`

	// bot is the bot classification that check_actor derives from Type and the
	// host's settings.
	bot bool
	// instance is the host part that check_actor cuts out of the actor URI.
	instance string
}
// TagType is one entry of a post's "tag" array. check_activity keeps the Name
// of every entry whose Type is "Hashtag".
type TagType struct {
	Type string `json:"type"`
	Name string `json:"name"`
}
// PostJson holds the properties of a fetched post/activity document that this
// program decodes, plus a few unexported bookkeeping fields.
//
// The exported fields are filled by encoding/json from the JSON property named
// in each tag. The unexported fields are never touched by encoding/json; they
// are set by the code that builds the value (see check_activity).
type PostJson struct {
	// id is not referenced by the code in this section of the file.
	id int

	Uri       string `json:"id"`
	InReplyTo string `json:"inReplyTo"`

	// normalized is the lower-cased, tag- and URL-stripped form of Content that
	// check_activity computes.
	normalized string
	// receivedAt is unexported, so encoding/json never reads or writes it. It
	// used to carry a `json:"created_at"` tag, which had no effect and is
	// reported by `go vet`; the tag was removed.
	receivedAt time.Time

	Content      string    `json:"content"`
	Conversation string    `json:"conversation"`
	Published    time.Time `json:"published"`
	Summary      string    `json:"summary"`
	Tag          []TagType `json:"tag"`
	To           []string  `json:"to"`
	Type         string    `json:"type"`
	Actor        string    `json:"actor"`
	AttributedTo string    `json:"attributedTo"`

	// bot is set by check_activity when the author or the host is classified
	// as a bot.
	bot bool
	// instance is the host part that check_activity cuts out of the post URI.
	instance string
}
// check_activity fetches the post at uri and stores it in the `activities`
// table.
//
// It returns nothing; every failure is logged and the URI is dropped. The
// steps are:
//
//  1. Cut the host out of the URI and skip banned hosts.
//  2. Skip URIs that are in the host's recent-activity cache or already in the
//     database.
//  3. GET the document and decode it into a PostJson.
//  4. Start a goroutine for the post it replies to, if any.
//  5. Resolve the author through check_actor; the post is dropped when that
//     fails.
//  6. Normalize the content, collect the hashtags and insert the row.
//  7. Start a check_actor goroutine for every addressee in "to" except the
//     public collection and followers collections.
func check_activity(uri string) {
	logDebug("Retrieving: " + uri)
	var activityjson PostJson

	// Ignore invalid URIs. The host is assumed to start at byte 8 (right after
	// "https://"), so anything shorter cannot be sliced and is rejected the
	// same way check_actor rejects it.
	if len(uri) <= 7 {
		return
	}
	endslash := strings.Index(uri[8:], "/")
	if endslash == -1 {
		return
	}
	activityjson.instance = uri[8 : endslash+8]

	o, _ := GetRunner(activityjson.instance)
	if o.Banned {
		logDebug("Ignoring banned instance: ", uri)
		return // Banned instance
	}

	// Check if there were any recent requests on this
	o.Recentactivities.Mu.Lock()
	i, _ := o.Recentactivities.Contains(uri)
	if i != -1 {
		o.Recentactivities.Mu.Unlock()
		logDebug("Ignoring cached recent request: ", uri)
		return
	}
	o.Recentactivities.Add(uri, "") // Added blank entry
	o.Recentactivities.Mu.Unlock()

	// The query selects no columns, so Scan takes no destinations: a nil error
	// only means that a matching row exists.
	selectRet := pool.QueryRow(context.Background(), "SELECT FROM activities WHERE document->>'id' = $1", uri)
	if err := selectRet.Scan(); err == nil {
		logDebug("Already in database, ignoring: ", uri)
		return
	}

	// NewRequest fails on URIs it cannot parse; req would then be nil.
	req, err := http.NewRequest("GET", uri, nil)
	if err != nil {
		logDebug("Failed to build request, err: ", err, " uri: ", uri)
		return
	}
	req.Header.Set("User-Agent", "Tusky")
	req.Header.Add("Accept", "application/ld+json")

	resp, err := DoTries(&o, req)
	if err != nil {
		logDebug("Gave up after multiple tries: ", uri)
		return
	}
	if resp.StatusCode != 200 {
		logDebug("Non-200 response code for ", uri, " was ", resp.StatusCode)
		resp.Body.Close()
		return
	}

	// Close right after reading, on the error path as well, instead of holding
	// the body open for the rest of the function.
	body, err := ioutil.ReadAll(resp.Body)
	resp.Body.Close()
	if err != nil {
		logDebug("Failed to read the reply: ", uri)
		return
	}
	jsondocument := string(body)

	err = json.Unmarshal(body, &activityjson)
	if err != nil {
		logDebug("Failed to Unmarshal, err: ", err, " uri: ", uri)
		return
	}

	// Also fetch the post this one replies to, unless it points at itself.
	if activityjson.InReplyTo != "" && activityjson.InReplyTo != uri {
		go check_activity(activityjson.InReplyTo)
	}

	// If AttributedTo is blank, this is likely an authentication failure
	// For now, skip it...
	if activityjson.AttributedTo == "" {
		logDebug("AttributedTo field is blank, dropping for ", uri)
		return
	}

	// This must be done BEFORE the `INSERT INTO activities'` below
	actorjson := check_actor(activityjson.AttributedTo)
	if actorjson == nil {
		logDebug("Failed to add actor, dropping post: ", uri)
		return
	}
	if actorjson.bot || o.Alwaysbot {
		activityjson.bot = true
	}

	// Normalization order: strip removeHTMLReg matches, sanitize, lower-case,
	// unescape HTML entities, drop URLs, collapse spaceReg matches.
	activityjson.normalized = removeHTMLReg.ReplaceAllString(activityjson.Content, " ")
	activityjson.normalized = html.UnescapeString(strings.ToLower(p.Sanitize(activityjson.normalized)))
	activityjson.normalized = matchurl.ReplaceAllString(activityjson.normalized, "")
	activityjson.normalized = spaceReg.ReplaceAllString(activityjson.normalized, " ")

	var hashtags []string
	for _, tag := range activityjson.Tag {
		if tag.Type == "Hashtag" {
			hashtags = append(hashtags, strings.ToLower(tag.Name))
		}
	}

	_, err = pool.Exec(context.Background(), "INSERT INTO activities (document, normalized, instance, hashtags, bot) VALUES($1, $2, $3, $4, $5)", jsondocument, activityjson.normalized, activityjson.instance, hashtags, activityjson.bot)
	if err != nil {
		logWarn("Error inserting ", uri, " into `activities`: ", err)
		return
	}

	for _, to := range activityjson.To {
		if to == "" || to == "https://www.w3.org/ns/activitystreams#Public" {
			continue
		}
		if strings.HasSuffix(to, "/followers") {
			// This check is very much a bad solution, may consider removing the entire for-loop
			continue
		}
		go check_actor(to)
	}
}
/* Test: TestCheck_actor */
// check_actor resolves the actor at uri and returns it, or nil when the actor
// cannot be resolved (malformed URI, banned host, network/HTTP failure,
// undecodable document, or failed database insert).
//
// Lookup order: the host's recent-actor cache, then the `actors` table, then
// an HTTP GET of uri. A freshly fetched actor is inserted into `actors` and
// added to the cache only after the insert succeeded, so a cached actor
// created here always has a database row behind it.
func check_actor(uri string) *ActorJson {
	actorjson := &ActorJson{}

	// The host is assumed to start at byte 8 (right after "https://").
	if len(uri) <= 7 {
		return nil // Bad actor
	}
	endslash := strings.Index(uri[8:], "/")
	if endslash == -1 {
		return nil // Bad actor
	}
	actorjson.instance = uri[8 : endslash+8]

	// Check if there were any recent requests on this
	o, _ := GetRunner(actorjson.instance)
	if o.Banned {
		logDebug("Banned actor: ", uri)
		return nil // Banned actor
	}

	o.Recentactors.Mu.Lock()
	i, cached := o.Recentactors.Contains(uri)
	o.Recentactors.Mu.Unlock()
	if i != -1 {
		// Only *ActorJson values are stored in this cache by this function; a
		// value of any other type is treated as a miss instead of panicking.
		if actor, ok := cached.(*ActorJson); ok {
			return actor
		}
	}

	// NOTE(review): only `document` is read back. The unexported bot field is
	// not part of the JSON document, so it stays false on this path even if the
	// row's `bot` column is true - confirm whether callers rely on it.
	selectRet := pool.QueryRow(context.Background(), "SELECT document FROM actors WHERE document->>'id' = $1", uri)
	if err := selectRet.Scan(&actorjson); err == nil {
		return actorjson // Actor already in database, good!
	}

	// NewRequest fails on URIs it cannot parse; req would then be nil.
	req, err := http.NewRequest("GET", uri, nil)
	if err != nil {
		logWarn("Unable to build request for ", uri, ": ", err)
		return nil // Bad actor
	}
	req.Header.Set("User-Agent", "Tusky")
	req.Header.Add("Accept", "application/ld+json")

	// Try up to maxTries times, sleeping 30 seconds between attempts. The log
	// messages below spell the limit out as "10".
	const maxTries = 10
	var resp *http.Response
	for tries := 1; ; tries++ {
		resp, err = o.Client.Do(req)
		if err == nil {
			break
		}
		if tries >= maxTries {
			logErr("Unable to connect to " + uri + " attempt 10/10, giving up.")
			return nil // Unable to connect to host after 10 attempts
		}
		logWarn("Unable to connect to "+uri+", attempt ", tries, "/10 sleeping for 30 seconds.")
		time.Sleep(time.Second * 30)
	}

	// Same rule as check_activity: anything but 200 is not an actor document.
	if resp.StatusCode != 200 {
		logWarn("Non-200 response code for ", uri, " was ", resp.StatusCode)
		resp.Body.Close()
		return nil
	}

	// Close right after reading so the error path does not leak the body.
	body, err := ioutil.ReadAll(resp.Body)
	resp.Body.Close()
	if err != nil {
		logWarn("Unable to read body from ", uri)
		return nil // Unable to read body of message
	}
	jsondocument := string(body)

	// Unexported fields (instance, bot) are left alone by the decoder.
	err = json.Unmarshal(body, actorjson)
	if err != nil {
		logWarn("Unable to unmarshal body from ", uri)
		return nil // Unable to unmarshal body of message
	}

	// "Service" actors are bots; otherwise default on host's classification.
	actorjson.bot = actorjson.Type == "Service" || o.Alwaysbot

	_, err = pool.Exec(context.Background(), "INSERT INTO actors (document, instance, bot) VALUES($1, $2, $3)", jsondocument, actorjson.instance, actorjson.bot)
	if err != nil {
		logWarn("Error inserting ", uri, " into `actors`: ", err)
		return nil // Unable to insert actor
	}

	// Publish to the cache only once the value is complete and stored.
	o.Recentactors.Mu.Lock()
	o.Recentactors.Add(uri, actorjson)
	o.Recentactors.Mu.Unlock()

	return actorjson // Successful
}