snac2

Fork of https://codeberg.org/grunfink/snac2
git clone https://git.inz.fi/snac2
Log | Files | Refs | README | LICENSE

rss.c (8082B)


      1 /* snac - A simple, minimalistic ActivityPub instance */
      2 /* copyright (c) 2025 grunfink et al. / MIT license */
      3 
      4 #include "xs.h"
      5 #include "xs_html.h"
      6 #include "xs_regex.h"
      7 #include "xs_time.h"
      8 #include "xs_match.h"
      9 #include "xs_curl.h"
     10 #include "xs_openssl.h"
     11 #include "xs_json.h"
     12 
     13 #include "snac.h"
     14 
     15 xs_str *rss_from_timeline(snac *user, const xs_list *timeline,
     16                         const char *title, const char *link, const char *desc)
     17 /* converts a timeline to rss */
     18 {
     19     xs_html *rss = xs_html_tag("rss",
     20         xs_html_attr("xmlns:content", "http:/" "/purl.org/rss/1.0/modules/content/"),
     21         xs_html_attr("version",       "2.0"),
     22         xs_html_attr("xmlns:atom",    "http:/" "/www.w3.org/2005/Atom"));
     23 
     24     xs_html *channel = xs_html_tag("channel",
     25         xs_html_tag("title",
     26             xs_html_text(title)),
     27         xs_html_tag("language",
     28             xs_html_text("en")),
     29         xs_html_tag("link",
     30             xs_html_text(link)),
     31         xs_html_sctag("atom:link",
     32             xs_html_attr("href", link),
     33             xs_html_attr("rel", "self"),
     34             xs_html_attr("type", "application/rss+xml")),
     35         xs_html_tag("generator",
     36             xs_html_text(USER_AGENT)),
     37         xs_html_tag("description",
     38             xs_html_text(desc)));
     39 
     40     xs_html_add(rss, channel);
     41 
     42     int cnt = 0;
     43     const char *v;
     44 
     45     xs_list_foreach(timeline, v) {
     46         xs *msg = NULL;
     47 
     48         if (user) {
     49             if (!valid_status(timeline_get_by_md5(user, v, &msg)))
     50                 continue;
     51         }
     52         else {
     53             if (!valid_status(object_get_by_md5(v, &msg)))
     54                 continue;
     55         }
     56 
     57         const char *id = xs_dict_get(msg, "id");
     58         const char *content = xs_dict_get(msg, "content");
     59         const char *published = xs_dict_get(msg, "published");
     60 
     61         if (user && !xs_startswith(id, user->actor))
     62             continue;
     63 
     64         if (!id || !content || !published)
     65             continue;
     66 
     67         /* create a title with the first line of the content */
     68         xs *title = xs_replace(content, "<br>", "\n");
     69         title = xs_regex_replace_i(title, "<[^>]+>", " ");
     70         title = xs_regex_replace_i(title, "&[^;]+;", " ");
     71         int i;
     72 
     73         for (i = 0; title[i] && title[i] != '\n' && i < 50; i++);
     74 
     75         if (title[i] != '\0') {
     76             title[i] = '\0';
     77             title = xs_str_cat(title, "...");
     78         }
     79 
     80         title = xs_strip_i(title);
     81 
     82         /* convert the date */
     83         time_t t = xs_parse_iso_date(published, 0);
     84         xs *rss_date = xs_str_utctime(t, "%a, %d %b %Y %T +0000");
     85 
     86         /* if it's the first one, add it to the header */
     87         if (cnt == 0)
     88             xs_html_add(channel,
     89                 xs_html_tag("lastBuildDate",
     90                     xs_html_text(rss_date)));
     91 
     92         xs_html_add(channel,
     93             xs_html_tag("item",
     94                 xs_html_tag("title",
     95                     xs_html_text(title)),
     96                 xs_html_tag("link",
     97                     xs_html_text(id)),
     98                 xs_html_tag("guid",
     99                     xs_html_text(id)),
    100                 xs_html_tag("pubDate",
    101                     xs_html_text(rss_date)),
    102                 xs_html_tag("description",
    103                     xs_html_text(content))));
    104 
    105         cnt++;
    106     }
    107 
    108     return xs_html_render_s(rss, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
    109 }
    110 
    111 
    112 void rss_to_timeline(snac *user, const char *url)
    113 /* reads an RSS and inserts all ActivityPub posts into the user's timeline */
    114 {
    115     if (!xs_startswith(url, "https:/") && !xs_startswith(url, "http:/"))
    116         return;
    117 
    118     xs *hdrs = xs_dict_new();
    119     hdrs = xs_dict_set(hdrs, "accept",     "application/rss+xml");
    120     hdrs = xs_dict_set(hdrs, "user-agent", USER_AGENT);
    121 
    122     /* get the RSS metadata */
    123     xs *md5 = xs_md5_hex(url, strlen(url));
    124     xs *rss_md_fn = xs_fmt("%s/rss", user->basedir);
    125     mkdirx(rss_md_fn);
    126     rss_md_fn = xs_str_cat(rss_md_fn, "/", md5, ".json");
    127 
    128     xs *rss_md = NULL;
    129     const char *etag = NULL;
    130 
    131     FILE *f;
    132     if ((f = fopen(rss_md_fn, "r")) != NULL) {
    133         rss_md = xs_json_load(f);
    134         fclose(f);
    135 
    136         etag = xs_dict_get(rss_md, "etag");
    137 
    138         if (xs_is_string(etag))
    139             hdrs = xs_dict_set(hdrs, "if-none-match", etag);
    140     }
    141 
    142     if (rss_md == NULL)
    143         rss_md = xs_dict_new();
    144 
    145     xs *payload = NULL;
    146     int status;
    147     int p_size;
    148 
    149     xs *rsp = xs_http_request("GET", url, hdrs, NULL, 0, &status, &payload, &p_size, 0);
    150 
    151     snac_log(user, xs_fmt("parsing RSS %s %d", url, status));
    152 
    153     if (!valid_status(status) || !xs_is_string(payload))
    154         return;
    155 
    156     /* not an RSS? done */
    157     const char *ctype = xs_dict_get(rsp, "content-type");
    158     if (!xs_is_string(ctype) || xs_str_in(ctype, "application/rss+xml") == -1)
    159         return;
    160 
    161     /* yes, parsing is done with regexes (now I have two problems blah blah blah) */
    162     xs *links = xs_regex_select(payload, "<link>[^<]+</link>");
    163     const char *link;
    164 
    165     xs_list_foreach(links, link) {
    166         xs *l = xs_replace(link, "<link>", "");
    167         char *p = strchr(l, '<');
    168 
    169         if (p == NULL)
    170             continue;
    171         *p = '\0';
    172 
    173         /* skip this same URL */
    174         if (strcmp(l, url) == 0)
    175             continue;
    176 
    177         /* skip crap */
    178         if (!xs_startswith(l, "https:/") && !xs_startswith(l, "http:/"))
    179             continue;
    180 
    181         snac_debug(user, 1, xs_fmt("RSS link: %s", l));
    182 
    183         if (timeline_here(user, l)) {
    184             snac_debug(user, 1, xs_fmt("RSS entry already in timeline %s", l));
    185             continue;
    186         }
    187 
    188         /* special trick for Mastodon: convert from the alternate format */
    189         if (strchr(l, '@') != NULL) {
    190             xs *l2 = xs_split(l, "/");
    191 
    192             if (xs_list_len(l2) == 5) {
    193                 const char *uid = xs_list_get(l2, 3);
    194                 if (*uid == '@') {
    195                     xs *guessed_id = xs_fmt("https:/" "/%s/users/%s/statuses/%s",
    196                         xs_list_get(l2, 2), uid + 1, xs_list_get(l2, -1));
    197 
    198                     if (timeline_here(user, guessed_id)) {
    199                         snac_debug(user, 1, xs_fmt("RSS entry already in timeline (alt) %s", guessed_id));
    200                         continue;
    201                     }
    202                 }
    203             }
    204         }
    205 
    206         xs *obj = NULL;
    207 
    208         if (!valid_status(object_get(l, &obj))) {
    209             /* object is not here: bring it */
    210             if (!valid_status(activitypub_request(user, l, &obj)))
    211                 continue;
    212         }
    213 
    214         if (xs_is_dict(obj)) {
    215             const char *id      = xs_dict_get(obj, "id");
    216             const char *type    = xs_dict_get(obj, "type");
    217             const char *attr_to = get_atto(obj);
    218 
    219             if (!xs_is_string(id) || !xs_is_string(type) || !xs_is_string(attr_to))
    220                 continue;
    221 
    222             if (!xs_match(type, POSTLIKE_OBJECT_TYPE))
    223                 continue;
    224 
    225             if (timeline_here(user, id)) {
    226                 snac_debug(user, 1, xs_fmt("RSS entry already in timeline (id) %s", id));
    227                 continue;
    228             }
    229 
    230             enqueue_actor_refresh(user, attr_to, 0);
    231 
    232             timeline_add(user, id, obj);
    233 
    234             snac_log(user, xs_fmt("new '%s' (RSS) %s %s", type, attr_to, id));
    235         }
    236     }
    237 
    238     /* update the RSS metadata */
    239     etag = xs_dict_get(rsp, "etag");
    240 
    241     if (xs_is_string(etag)) {
    242         rss_md = xs_dict_set(rss_md, "etag", etag);
    243         rss_md = xs_dict_set(rss_md, "url", url);
    244         if ((f = fopen(rss_md_fn, "w")) != NULL) {
    245             xs_json_dump(rss_md, 4, f);
    246             fclose(f);
    247         }
    248     }
    249 }
    250 
    251 
    252 void rss_poll_hashtags(void)
    253 /* parses all RSS from all users */
    254 {
    255     xs *list = user_list();
    256     const char *uid;
    257 
    258     xs_list_foreach(list, uid) {
    259         snac user;
    260 
    261         if (user_open(&user, uid)) {
    262             const xs_list *rss = xs_dict_get(user.config, "followed_hashtags");
    263 
    264             if (xs_is_list(rss)) {
    265                 const char *url;
    266 
    267                 xs_list_foreach(rss, url)
    268                     rss_to_timeline(&user, url);
    269             }
    270 
    271             user_free(&user);
    272         }
    273     }
    274 }