kotaku-uk-rss/kotaku-uk-rss.go

package main

import (
    "fmt"
    "log"
    "net/http"
    "sort"
    "sync"
    "time"

    "github.com/PuerkitoBio/goquery"
    "github.com/gorilla/feeds"
)

/*
Structs
*/

// Article holds the fields scraped from a single article listing.
type Article struct {
    Title    string
    Date     time.Time
    Author   string
    Summary  string
    MediaURL string
    URL      string
    Href     string
}

// Articles is a collection of Article structs.
type Articles []Article

// SectionsByArticleLength sorts section nodes in descending order by the
// number of <article> child nodes they contain.
type SectionsByArticleLength []*goquery.Selection

func (s SectionsByArticleLength) Len() int {
    return len(s)
}

func (s SectionsByArticleLength) Less(i, j int) bool {
    // Note the ">": sections with the most articles sort first.
    return s[i].Find("article").Length() > s[j].Find("article").Length()
}

func (s SectionsByArticleLength) Swap(i, j int) {
    s[i], s[j] = s[j], s[i]
}

// RssCache is a string cache guarded by a read/write mutex, so the HTTP
// handler and the update loop can access it concurrently.
type RssCache struct {
    sync.RWMutex
    body string
}

// Set replaces the cached RSS body.
func (r *RssCache) Set(value string) {
    r.Lock()
    defer r.Unlock()
    r.body = value
}

// Get returns the cached RSS body.
func (r *RssCache) Get() string {
    r.RLock()
    defer r.RUnlock()
    return r.body
}
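
// The functions below reference rootURL, pageUrls, and rssCache, whose
// declarations are not included in this excerpt. A minimal sketch of what
// they could look like follows: the root URL matches the feed link used in
// buildFeed, but the pageUrls value is an assumption and may differ in the
// full file.
var (
    rootURL  = "http://www.kotaku.co.uk"
    pageUrls = []string{rootURL}
    rssCache = &RssCache{}
)
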
/*
Functions
*/

// fetchDocument downloads and parses the HTML document at url. Any fetch or
// parse error is fatal and terminates the process. (goquery.NewDocument is
// deprecated in recent goquery releases in favour of NewDocumentFromReader,
// but it still works.)
func fetchDocument(url string) *goquery.Document {
    doc, err := goquery.NewDocument(url)
    if err != nil {
        log.Fatal(err)
    }
    return doc
}

// extractArticleSection collects every <section> on the page and returns the
// one containing the most <article> nodes, which is taken to be the article
// listing.
func extractArticleSection(doc *goquery.Document) *goquery.Selection {
    sections := SectionsByArticleLength{}
    doc.Find("section").Each(func(i int, s *goquery.Selection) {
        sections = append(sections, s)
    })
    if len(sections) == 0 {
        log.Fatal("no <section> elements found in document")
    }
    sort.Sort(sections)
    return sections[0]
}

// parseArticleSection parses each <article> node in the section into an
// Article.
func parseArticleSection(section *goquery.Selection) Articles {
    result := Articles{}
    section.Find("article").Each(func(i int, s *goquery.Selection) {
        result = append(result, parseArticle(s))
    })
    return result
}

// parseArticle extracts the fields of a single Article from its DOM node.
func parseArticle(s *goquery.Selection) Article {
    // Prefer the headline link; fall back to the figure link for image-only
    // entries.
    href, _ := s.Find(".media__body h2 a").Attr("href")
    if href == "" {
        href, _ = s.Find("figure a").Attr("href")
    }
    url := rootURL + href
    summary := s.Find(".media__body p").Text()
    title := s.Find(".media__body h2").Text()
    if title == "" {
        // No headline: derive one from the summary, adding an ellipsis only
        // when the summary was actually truncated.
        title = truncateString(summary, 60)
        if len([]rune(summary)) > 60 {
            title += "..."
        }
    }
    mediaURL, _ := s.Find("figure").Attr("data-media992")
    timeString, _ := s.Find(".meta__limited time").Attr("datetime")
    // A parse failure leaves Date as the zero time.
    parsedTime, _ := time.Parse(time.RFC3339, timeString)
    return Article{
        Title:    title,
        Date:     parsedTime,
        Author:   s.Find(".meta__full a.is-author").Text(),
        Summary:  summary,
        MediaURL: mediaURL,
        URL:      url,
        Href:     href,
    }
}

// truncateString returns at most l characters of s, truncating on rune
// boundaries so multi-byte characters are not split.
func truncateString(s string, l int) string {
    runes := []rune(s)
    if len(runes) > l {
        return string(runes[:l])
    }
    return s
}

// getArticlesFromURL fetches a page and returns the articles scraped from it.
func getArticlesFromURL(url string) Articles {
    doc := fetchDocument(url)
    section := extractArticleSection(doc)
    return parseArticleSection(section)
}

// buildFeed wraps the articles in a gorilla/feeds Feed.
func buildFeed(articles Articles) *feeds.Feed {
    feed := &feeds.Feed{
        Title:       "Kotaku UK",
        Link:        &feeds.Link{Href: "http://www.kotaku.co.uk/"},
        Description: "Kotaku UK is the UK version of Kotaku",
        Created:     time.Now(),
    }
    feed.Items = []*feeds.Item{}
    for _, article := range articles {
        feed.Items = append(feed.Items, buildFeedItem(article))
    }
    return feed
}

// buildFeedItem converts an Article into a feed item, prepending the article
// image to the description when one is available.
func buildFeedItem(article Article) *feeds.Item {
    description := article.Summary
    if article.MediaURL != "" {
        // <img> takes its source from the src attribute, not href.
        description = fmt.Sprintf("<img src=\"%s\" /> %s",
            article.MediaURL, description)
    }
    return &feeds.Item{
        Id:          article.Href,
        Title:       article.Title,
        Link:        &feeds.Link{Href: article.URL},
        Description: description,
        Author:      &feeds.Author{Name: article.Author},
        Created:     article.Date,
    }
}

// serveRss writes the cached RSS document to the client, labelled with the
// conventional RSS media type.
func serveRss(w http.ResponseWriter, r *http.Request) {
    w.Header().Set("Content-Type", "application/rss+xml")
    fmt.Fprintln(w, rssCache.Get())
}

// updateRssLoop rebuilds the RSS cache from the configured pages once a
// minute, forever. It is intended to run in its own goroutine.
func updateRssLoop() {
    for {
        articles := Articles{}
        for _, url := range pageUrls {
            fmt.Printf("fetching and parsing articles from: %s\n", url)
            articles = append(articles, getArticlesFromURL(url)...)
        }
        fmt.Printf("building feed cache... ")
        rss, err := buildFeed(articles).ToRss()
        if err != nil {
            // Keep serving the previous cache if rendering fails.
            log.Printf("could not render feed: %v", err)
        } else {
            rssCache.Set(rss)
        }
        fmt.Println("done")
        fmt.Println("taking a nap for 60 seconds ^_^")
        time.Sleep(60 * time.Second)
    }
}
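
// The file's main function is not part of this excerpt. A minimal sketch,
// assuming the feed is served at the root path on port 8080 (both the path
// and the port are assumptions, not taken from the original file):
func main() {
    go updateRssLoop()
    http.HandleFunc("/", serveRss)
    log.Fatal(http.ListenAndServe(":8080", nil))
}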