rss.c (8082B)
1 /* snac - A simple, minimalistic ActivityPub instance */ 2 /* copyright (c) 2025 grunfink et al. / MIT license */ 3 4 #include "xs.h" 5 #include "xs_html.h" 6 #include "xs_regex.h" 7 #include "xs_time.h" 8 #include "xs_match.h" 9 #include "xs_curl.h" 10 #include "xs_openssl.h" 11 #include "xs_json.h" 12 13 #include "snac.h" 14 15 xs_str *rss_from_timeline(snac *user, const xs_list *timeline, 16 const char *title, const char *link, const char *desc) 17 /* converts a timeline to rss */ 18 { 19 xs_html *rss = xs_html_tag("rss", 20 xs_html_attr("xmlns:content", "http:/" "/purl.org/rss/1.0/modules/content/"), 21 xs_html_attr("version", "2.0"), 22 xs_html_attr("xmlns:atom", "http:/" "/www.w3.org/2005/Atom")); 23 24 xs_html *channel = xs_html_tag("channel", 25 xs_html_tag("title", 26 xs_html_text(title)), 27 xs_html_tag("language", 28 xs_html_text("en")), 29 xs_html_tag("link", 30 xs_html_text(link)), 31 xs_html_sctag("atom:link", 32 xs_html_attr("href", link), 33 xs_html_attr("rel", "self"), 34 xs_html_attr("type", "application/rss+xml")), 35 xs_html_tag("generator", 36 xs_html_text(USER_AGENT)), 37 xs_html_tag("description", 38 xs_html_text(desc))); 39 40 xs_html_add(rss, channel); 41 42 int cnt = 0; 43 const char *v; 44 45 xs_list_foreach(timeline, v) { 46 xs *msg = NULL; 47 48 if (user) { 49 if (!valid_status(timeline_get_by_md5(user, v, &msg))) 50 continue; 51 } 52 else { 53 if (!valid_status(object_get_by_md5(v, &msg))) 54 continue; 55 } 56 57 const char *id = xs_dict_get(msg, "id"); 58 const char *content = xs_dict_get(msg, "content"); 59 const char *published = xs_dict_get(msg, "published"); 60 61 if (user && !xs_startswith(id, user->actor)) 62 continue; 63 64 if (!id || !content || !published) 65 continue; 66 67 /* create a title with the first line of the content */ 68 xs *title = xs_replace(content, "<br>", "\n"); 69 title = xs_regex_replace_i(title, "<[^>]+>", " "); 70 title = xs_regex_replace_i(title, "&[^;]+;", " "); 71 int i; 72 73 for (i = 0; title[i] && title[i] != '\n' && i < 50; i++); 74 75 if (title[i] != '\0') { 76 title[i] = '\0'; 77 title = xs_str_cat(title, "..."); 78 } 79 80 title = xs_strip_i(title); 81 82 /* convert the date */ 83 time_t t = xs_parse_iso_date(published, 0); 84 xs *rss_date = xs_str_utctime(t, "%a, %d %b %Y %T +0000"); 85 86 /* if it's the first one, add it to the header */ 87 if (cnt == 0) 88 xs_html_add(channel, 89 xs_html_tag("lastBuildDate", 90 xs_html_text(rss_date))); 91 92 xs_html_add(channel, 93 xs_html_tag("item", 94 xs_html_tag("title", 95 xs_html_text(title)), 96 xs_html_tag("link", 97 xs_html_text(id)), 98 xs_html_tag("guid", 99 xs_html_text(id)), 100 xs_html_tag("pubDate", 101 xs_html_text(rss_date)), 102 xs_html_tag("description", 103 xs_html_text(content)))); 104 105 cnt++; 106 } 107 108 return xs_html_render_s(rss, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"); 109 } 110 111 112 void rss_to_timeline(snac *user, const char *url) 113 /* reads an RSS and inserts all ActivityPub posts into the user's timeline */ 114 { 115 if (!xs_startswith(url, "https:/") && !xs_startswith(url, "http:/")) 116 return; 117 118 xs *hdrs = xs_dict_new(); 119 hdrs = xs_dict_set(hdrs, "accept", "application/rss+xml"); 120 hdrs = xs_dict_set(hdrs, "user-agent", USER_AGENT); 121 122 /* get the RSS metadata */ 123 xs *md5 = xs_md5_hex(url, strlen(url)); 124 xs *rss_md_fn = xs_fmt("%s/rss", user->basedir); 125 mkdirx(rss_md_fn); 126 rss_md_fn = xs_str_cat(rss_md_fn, "/", md5, ".json"); 127 128 xs *rss_md = NULL; 129 const char *etag = NULL; 130 131 FILE *f; 132 if ((f = fopen(rss_md_fn, "r")) != NULL) { 133 rss_md = xs_json_load(f); 134 fclose(f); 135 136 etag = xs_dict_get(rss_md, "etag"); 137 138 if (xs_is_string(etag)) 139 hdrs = xs_dict_set(hdrs, "if-none-match", etag); 140 } 141 142 if (rss_md == NULL) 143 rss_md = xs_dict_new(); 144 145 xs *payload = NULL; 146 int status; 147 int p_size; 148 149 xs *rsp = xs_http_request("GET", url, hdrs, NULL, 0, &status, &payload, &p_size, 0); 150 151 snac_log(user, xs_fmt("parsing RSS %s %d", url, status)); 152 153 if (!valid_status(status) || !xs_is_string(payload)) 154 return; 155 156 /* not an RSS? done */ 157 const char *ctype = xs_dict_get(rsp, "content-type"); 158 if (!xs_is_string(ctype) || xs_str_in(ctype, "application/rss+xml") == -1) 159 return; 160 161 /* yes, parsing is done with regexes (now I have two problems blah blah blah) */ 162 xs *links = xs_regex_select(payload, "<link>[^<]+</link>"); 163 const char *link; 164 165 xs_list_foreach(links, link) { 166 xs *l = xs_replace(link, "<link>", ""); 167 char *p = strchr(l, '<'); 168 169 if (p == NULL) 170 continue; 171 *p = '\0'; 172 173 /* skip this same URL */ 174 if (strcmp(l, url) == 0) 175 continue; 176 177 /* skip crap */ 178 if (!xs_startswith(l, "https:/") && !xs_startswith(l, "http:/")) 179 continue; 180 181 snac_debug(user, 1, xs_fmt("RSS link: %s", l)); 182 183 if (timeline_here(user, l)) { 184 snac_debug(user, 1, xs_fmt("RSS entry already in timeline %s", l)); 185 continue; 186 } 187 188 /* special trick for Mastodon: convert from the alternate format */ 189 if (strchr(l, '@') != NULL) { 190 xs *l2 = xs_split(l, "/"); 191 192 if (xs_list_len(l2) == 5) { 193 const char *uid = xs_list_get(l2, 3); 194 if (*uid == '@') { 195 xs *guessed_id = xs_fmt("https:/" "/%s/users/%s/statuses/%s", 196 xs_list_get(l2, 2), uid + 1, xs_list_get(l2, -1)); 197 198 if (timeline_here(user, guessed_id)) { 199 snac_debug(user, 1, xs_fmt("RSS entry already in timeline (alt) %s", guessed_id)); 200 continue; 201 } 202 } 203 } 204 } 205 206 xs *obj = NULL; 207 208 if (!valid_status(object_get(l, &obj))) { 209 /* object is not here: bring it */ 210 if (!valid_status(activitypub_request(user, l, &obj))) 211 continue; 212 } 213 214 if (xs_is_dict(obj)) { 215 const char *id = xs_dict_get(obj, "id"); 216 const char *type = xs_dict_get(obj, "type"); 217 const char *attr_to = get_atto(obj); 218 219 if (!xs_is_string(id) || !xs_is_string(type) || !xs_is_string(attr_to)) 220 continue; 221 222 if (!xs_match(type, POSTLIKE_OBJECT_TYPE)) 223 continue; 224 225 if (timeline_here(user, id)) { 226 snac_debug(user, 1, xs_fmt("RSS entry already in timeline (id) %s", id)); 227 continue; 228 } 229 230 enqueue_actor_refresh(user, attr_to, 0); 231 232 timeline_add(user, id, obj); 233 234 snac_log(user, xs_fmt("new '%s' (RSS) %s %s", type, attr_to, id)); 235 } 236 } 237 238 /* update the RSS metadata */ 239 etag = xs_dict_get(rsp, "etag"); 240 241 if (xs_is_string(etag)) { 242 rss_md = xs_dict_set(rss_md, "etag", etag); 243 rss_md = xs_dict_set(rss_md, "url", url); 244 if ((f = fopen(rss_md_fn, "w")) != NULL) { 245 xs_json_dump(rss_md, 4, f); 246 fclose(f); 247 } 248 } 249 } 250 251 252 void rss_poll_hashtags(void) 253 /* parses all RSS from all users */ 254 { 255 xs *list = user_list(); 256 const char *uid; 257 258 xs_list_foreach(list, uid) { 259 snac user; 260 261 if (user_open(&user, uid)) { 262 const xs_list *rss = xs_dict_get(user.config, "followed_hashtags"); 263 264 if (xs_is_list(rss)) { 265 const char *url; 266 267 xs_list_foreach(rss, url) 268 rss_to_timeline(&user, url); 269 } 270 271 user_free(&user); 272 } 273 } 274 }