Crawler works

This commit is contained in:
farhan 2020-10-29 03:08:30 +00:00
parent d5a20f836e
commit 5ce495b81c

View File

@ -104,6 +104,15 @@ func parseCommand(c net.Conn) {
}
*/
// AppendIfMissing returns hay with needle appended, unless some element of
// hay already equals needle, in which case hay is returned unchanged.
func AppendIfMissing(hay []string, needle string) []string {
	present := false
	// Stop scanning as soon as a match is seen.
	for i := 0; i < len(hay) && !present; i++ {
		present = hay[i] == needle
	}
	if !present {
		hay = append(hay, needle)
	}
	return hay
}
func StartInstancePoll(endpoint string, min_id string, reportPostChan chan ReportPost, pollMessageChan chan PollMessage, reportInstanceChan chan ReportInstance) {
p := bluemonday.NewPolicy()
newposts := make([]ReportPost, 0)
@ -124,6 +133,7 @@ func StartInstancePoll(endpoint string, min_id string, reportPostChan chan Repor
body, err := ioutil.ReadAll(resp.Body)
err = json.Unmarshal(body, &newposts)
if err != nil {
fmt.Println("Unmarshal 3");
// Perhaps get rid of this if-condition?
if resp.StatusCode >= 400 && resp.StatusCode < 500 {
reportInstanceChan <- ReportInstance{endpoint, endpoint, resp.StatusCode}
@ -132,20 +142,25 @@ func StartInstancePoll(endpoint string, min_id string, reportPostChan chan Repor
} else {
reportInstanceChan <- ReportInstance{endpoint, endpoint, UNMARSHAL_ERROR}
}
log.Fatal(err)
//log.Fatal(err)
return
}
newinstances := make([]string, 0)
numposts := 0
for _, newpost := range newposts {
posthash := sha1.New()
if strings.Contains(newpost.Account.Acct, "@") == false {
at_sign := strings.Index(newpost.Account.Acct, "@")
if at_sign == -1 {
at_sign = len(newpost.Account.Acct)
newpost.Account.Acct += "@" + endpoint
}
// Calculate the post hash
fmt.Fprint(posthash, newpost.Url)
fmt.Fprint(posthash, newpost.Content)
fmt.Fprint(posthash, newpost.StrippedContent)
fmt.Fprint(posthash, newpost.Account.Acct)
fmt.Fprint(posthash, newpost.Account.Display_name)
fmt.Fprint(posthash, newpost.Account.Url)
@ -160,8 +175,22 @@ func StartInstancePoll(endpoint string, min_id string, reportPostChan chan Repor
min_id = newpost.Id
}
numposts = numposts + 1
newinstance := newpost.Account.Acct[at_sign+1:]
newinstances = AppendIfMissing(newinstances, newinstance)
}
for _, newinstance := range newinstances {
var q ReportInstance
q.from = endpoint
q.endpoint = newinstance
q.status = NEW_INSTANCE
reportInstanceChan <- q
}
fmt.Println(newinstances)
pollMessageChan <- PollMessage{endpoint, resp.StatusCode, min_id, numposts}
}
@ -180,6 +209,7 @@ func StartGetPeers(endpoint string, reportInstanceChan chan ReportInstance) {
err = json.Unmarshal([]byte(body), &newpeers)
if err != nil {
fmt.Println("Unmarshal 1");
log.Fatal(err)
reportInstanceChan <- ReportInstance{endpoint, endpoint, UNMARSHAL_ERROR}
return
@ -229,8 +259,8 @@ func GetNodeInfo(endpoint string, nodeinfo *NodeInfo) {
body, err := ioutil.ReadAll(resp.Body)
err = json.Unmarshal(body, &nodeinfo)
fmt.Println("Body: " + string(body))
if err != nil {
fmt.Println("Unmarshal 2");
return
}
}
@ -285,12 +315,10 @@ func writePost(pool *pgxpool.Pool, reportpost ReportPost) {
os.Exit(1) // For now I want this to die and learn why it failed
return
}
fmt.Println("Properly executed")
}
func SuspendInstance(suspendinstance ReportInstance, runninginstances *[]RunningInstance) {
fmt.Println("Suspend")
for _, runninginstance := range *runninginstances {
if runninginstance.endpoint == suspendinstance.endpoint {
runninginstance.status = suspendinstance.status
@ -358,6 +386,7 @@ func main() {
go writePost(pool, v)
case w := <-reportInstanceChan: // Start or suspend instance
if w.status == NEW_INSTANCE {
fmt.Println("NEW INSTANCE: ", w.endpoint)
NewInstance(w.endpoint, &runninginstances, reportInstanceChan, reportPostChan, pollMessageChan)
} else {
SuspendInstance(w, &runninginstances)