From 7504f5438a0a8d196521dc7b7de0015e88548733 Mon Sep 17 00:00:00 2001 From: Gabriel Garrido Date: Sun, 19 May 2024 22:17:10 +0200 Subject: [PATCH] Add missing api params, update README --- README.md | 118 +++++++++++++++++++++++++++-------------------- client/client.go | 9 ---- client/post.go | 24 ++++++++++ main.go | 20 +++++--- 4 files changed, 104 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index a6e903f..8d39368 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,21 @@ # Mastodon markdown archive -Fetch a Mastodon account's posts and save them as markdown files. Post content is converted to markdown, images are downloaded and inlined, and replies are threaded. A post whose visibility is not `public` is skipped, and the post's id is used as the filename. +Fetch a Mastodon account's posts and save them as text files using Mastodon's [statuses API](https://docs.joinmastodon.org/methods/accounts/#statuses). -Implements most of the parameters in Mastodon's public [API to get an account's statuses](https://docs.joinmastodon.org/methods/accounts/#statuses). +This program essentially wraps the Mastodon API with a command line interface with some additional features. -I use this tool to create an [archive of my Mastodon posts](https://garrido.io/microblog/), which I then syndicate to my own site following [PESOS](https://indieweb.org/PESOS). +**Features** +- Supports all parameters in Mastodon's statuses API +- Convert post to markdown +- Customize output file location, name, and extension +- Customize output format and front matter +- Optionally download of post media +- Optionally threading of posts +- Optionally filter based on post visibility +- Optional affordances for scripting +- Optionally persist fetched post id cursors + +I use this tool to create an archive of my Mastodon posts and [syndicate them to my own site](https://garrido.io/microblog/), per IndieWeb's [PESOS philosophy](https://indieweb.org/PESOS). 
## Install @@ -18,27 +29,33 @@ Usage of mastodon-markdown-archive: -dist string Path to directory where files will be written (default "./posts") -download-media string - Download media in a post. Omit or pass an empty string to not download media. Pass 'bundle' to download the media inline in a single directory with its original post. Pass a path to a directory to download all media there. + Path where post attachments will be downloaded. Omit to skip downloading attachments. -exclude-reblogs - Exclude reblogs + Mastodon API parameter: Filter out boosts from the response -exclude-replies - Exclude replies to other users + Mastodon API parameter: Filter out statuses in reply to a different account -filename string Template for post filename -limit int - Maximum number of posts to fetch (default 40) + Mastodon API parameter: Maximum number of results to return. Defaults to 20 statuses. Max 40 statuses (default 40) -max-id string - Fetch posts older than this id + Mastodon API parameter: All results returned will be lesser than this ID. In effect, sets an upper bound on results. -min-id string - Fetch posts newer than this id + Mastodon API parameter: Returns results immediately newer than this ID. In effect, sets a cursor at this ID and paginates forward. + -only-media + Mastodon API parameter: Filter out status without attachments -persist-first string Location to persist the post id of the first post returned -persist-last string Location to persist the post id of the last post returned + -pinned + Mastodon API parameter: Filter for pinned statuses only -porcelain Prints the amount of fetched posts to stdout in a parsable manner -since-id string - Fetch posts greater than this id + Mastodon API parameter: All results returned will be greater than this ID. In effect, sets a lower bound on results. 
+ -tagged string + Mastodon API parameter: Filter for statuses using a specific hashtag -template string Template to use for post rendering, if passed -threaded @@ -51,9 +68,7 @@ Usage of mastodon-markdown-archive: ## Example -Here is how I use this to archive posts from my Mastodon account. - -I use this tool programatically, and I do not want to recreate the archive from scratch each time. I exclude replies to others, and reblogs. +I use this tool programmatically, and I do not want to recreate the archive from scratch each time. I thread posts, exclude replies to others, exclude reblogs, and filter out any post that is not public. I first used this to generate an archive of all the posts that I had published to date. Then, I run it programatically to archive any new posts made. @@ -68,14 +83,15 @@ mastodon-markdown-archive \ --exclude-replies \ --exclude-reblogs \ --persist-last=./last \ +--visibility=public \ +--download-media=bundle \ +--threaded=true \ --max-id=$(test -f ./last && cat ./last || echo "") ``` -Calling this for the first time will fetch the most recent 40 posts. With `--persist-last./last`, the oldest fetched post id will be saved at `./last`. +Calling this for the first time will fetch the most recent 40 posts. With `--persist-last=./last`, the oldest fetched post id will be saved at `./last`. Calling this command again will set the `last` cursor to the oldest post of the next 40 posts, and so on. -Calling this command iteratively will fetch the account's posts in reverse chronological order, 40 posts at a time. - -You can use simple bash script to automate this process. Adding the `--porcelain` flag prints the amount of fetched posts to stdout, which can then be used continue or stop fetching posts: +You can use a simple bash script to automate this process. 
Adding the `--porcelain` flag prints the amount of fetched posts to stdout, which can then be used to continue or stop fetching posts: ```bash #!/bin/bash @@ -86,6 +102,8 @@ while true; do --exclude-reblogs=true \ --user=https://social.coop/@ggpsv \ --porcelain=true \ + --visibility=public \ + --download-media=bundle \ --threaded=true \ --persist-last=./last \ --max-id=$(test -f ./last && cat ./last || echo '')" @@ -102,16 +120,21 @@ done ### Getting the latest posts +Having created the entire archive, I now want to run this on a schedule to retrieve only the latest posts. + With `--persist-first=./first`, the most recent post id will be saved at `./first`. -Calling this command iteratively will only fetch posts that have been made since the last retrieved post: +Calling this command iteratively will only fetch posts that have been made since then. ```sh mastodon-markdown-archive \ --user=https://social.coop/@ggpsv \ --dist=./posts \ ---exclude-replies \ ---exclude-reblogs \ +--exclude-replies=true \ +--exclude-reblogs=true \ +--visibility=public \ +--download-media=bundle \ +--threaded=true \ --persist-first=./first \ --since-id=$(test -f ./first && cat ./first || echo "") ``` @@ -120,30 +143,34 @@ mastodon-markdown-archive \ By default, posts by the author in reply to another post by the author will be written out as separate files. -However, posts can be threaded together using the `--threaded=true` flag. With threading, the descendants of a post will not be written out as a separate files. Instead, only the top post will be written out. +Alternatively, posts can be threaded together using the `--threaded=true` flag. With threading, the descendants of a post will not be written out as separate files. Instead, only the top post will be written out. 
-The program will aggregate the post's descendants in reverse chronological order and make them available in the template via the [Descendants](https://pkg.go.dev/git.garrido.io/gabriel/mastodon-markdown-archive/client#Post.Descendants) method. This can be used in [templates](#templating) to render threaded posts as a single post, which the default template does. +The program will aggregate the post's descendants in reverse chronological order and make them available in the template via the [Descendants](https://pkg.go.dev/git.garrido.io/gabriel/mastodon-markdown-archive/client#Post.Descendants) method. This can be used in [templates](#templating) to render threaded posts as a single post, which the [default template does](./files/templates/post.tmpl#L33). When threading, the `AllMedia` and `AllTags` methods will yield the aggregated [MediaAttachment](https://pkg.go.dev/git.garrido.io/gabriel/mastodon-markdown-archive/client#MediaAttachment) and [Tag](https://pkg.go.dev/git.garrido.io/gabriel/mastodon-markdown-archive/client#Tag), respectively. +When the `--visibility` flag is used, only the top post's visibility is evaluated. This is done explicitly to support the common practice in Mastodon of setting threaded replies as `unlisted`. + ### Orphaned posts Mastodon limits their statuses API to a maximum 40 posts at a time, and the `--limit` flag can be used to limit this further. -Because of this limit, it is possible that posts in a thread end up split across different responses. Or, a user may maintain a long-lived thread of posts that gets updated sporadically. This results in an orphaned post, which is a post whose parent is not within the same batch of posts returned by a single API call. +Because of this limit, it is possible that posts in a thread end up split across different responses. Or, a user may maintain a long-lived thread of posts that gets updated sporadically and thus rarely will a single batch of posts have all the descendants of the post. 
+ +An orphaned post is a post whose parent is not within a batch of posts returned by a single API call. In either case, the program will fallback to using the [status context](https://docs.joinmastodon.org/methods/statuses/#context) endpoint to rebuild the corresponding thread from the top. ## Templating -The contents of the file and the filename for each post can be customized using templates. This provides enough flexibility to use this tool for various purposes. The templates are evaluated as Go [text templates](https://pkg.go.dev/text/template), so it should be possible to do anything that's supported in a Go template. +The contents of the file and the filename for each post can be customized using templates. This provides enough flexibility to use this tool for various purposes. The templates are evaluated as Go [text templates](https://pkg.go.dev/text/template), so it should be possible to do anything that's normally supported in a Go template. -For example, if you're using this to syndicate posts to a site built using a static site generator, you can customize the output so that it adheres to specific requirements around front-matter structure or filename formats. +For example, if you're using this to syndicate posts to a site built using a static site generator, you can customize the output so that it adheres to specific requirements around front matter structure or filename formats. ### Post -Out of the box, this tool uses the [post.tmpl](./files/templates/post.tmpl) template to create the post file. It converts the post content to markdown, threads replies, and defines some attributes in the front-matter using YAML. +Out of the box, this tool uses the [post.tmpl](./files/templates/post.tmpl) template to create the post file. It converts the post content to markdown, threads replies, and defines some attributes in the front matter using YAML. 
-For example, for this [post](https://social.coop/@ggpsv/112326240503555949): +For example, this [post](https://social.coop/@ggpsv/112326240503555949) is converted to this markdown file: ```md --- @@ -165,7 +192,7 @@ Also, KDE Plasma 6 looks incredibly crisp on this screen. A different template can be used by passing its path to `--template`. The template must comply with Go template syntax. -For example, a `jekyll.tmpl` template with customized front-matter : +For example, a `jekyll.tmpl` template with customized front matter : ``` --- @@ -177,7 +204,7 @@ published: true {{ .Post.Content | toMarkdown }} ``` -Passed to the command as `--template=./jekyll.tmpl` will yield a file that looks like this: +Passed to the command as `--template=./jekyll.tmpl` will instead yield a file that looks like this: ```md --- @@ -195,7 +222,7 @@ I simply decrypted my drive, shrunk it, created a partition, booted off a USB ke Also, KDE Plasma 6 looks incredibly crisp on this screen. ``` -You might even want to use HTML as the output and thus have a `html.tmpl` file: +You might even want to use HTML as the output and thus pass a `--template=./html.tmpl` flag for a `html.tmpl` template that looks like this: ```html @@ -211,31 +238,16 @@ You might even want to use HTML as the output and thus have a `html.tmpl` file: ``` -Passed to the command as `--template=./html.tmpl` will yield a file that looks like this: -```html - - - - - - 112326240503555949 - - -

Back at dual-booting on the . Last time it was Ubuntu, but now I have gone with 40 KDE.

I'm impressed with how things just work with this laptop. Major props to the @frameworkcomputer team for supporting these distros out of the box.

I simply decrypted my drive, shrunk it, created a partition, booted off a USB key, installed Fedora, encrypted both partitions, and that's it.

Also, KDE Plasma 6 looks incredibly crisp on this screen.

- - -``` - ### Filename -Out of the box, this tool uses `.md` as the post filename format. For example, this [post](https://social.coop/@ggpsv/112326240503555949) is saved `112326240503555949.md` +Out of the box, this tool uses the post's id and the `.md` extension for the filename. For example, this [post](https://social.coop/@ggpsv/112326240503555949) is saved as `112326240503555949.md` A different format for the filename can be used by passing a template string to `--filename`. The string must comply with Go template syntax. For example, to create post files that are prefixed with the post's creation date in `YYYY-MM-DD` format and suffixed with the post id, pass `--filename='{{.Post.CreatedAt | date "2006-01-02"}}-{{.Post.Id}}.md`. -An extension suffixed to the filename template will be used if present. Otherwise, `.md` is used as the default file extension. +An extension in the filename template will be used if present. Otherwise, `.md` is used as the default file extension. -Following the HTML example in the [post template section](#post) above, you format the filename as `--filename='{{.Post.Id}}.html'` to use HTML as the extension. +Following the HTML example in the [post template section](#post) above, you may customize the filename as `--filename='{{.Post.Id}}.html'` to use HTML as the output file extension. ### Available functions and variables @@ -258,8 +270,12 @@ For example, `--download-media=./images` saves any media to the `./images`. Once downloaded, the media's path is available in [MediaAttachment.Path](https://pkg.go.dev/git.garrido.io/gabriel/mastodon-markdown-archive/client#MediaAttachment) as an absolute path. -Sprig's [path](https://masterminds.github.io/sprig/paths.html) functions can be used in the templates to manipulate the path as necessary. For example, the default template uses `osBase` to get the last element of the filepath. 
+Sprig's [path](https://masterminds.github.io/sprig/paths.html) functions can be used in the templates to manipulate the path as necessary. For example, the [default template](https://git.hq.ggpsv.com/gabriel/mastodon-markdown-archive/src/branch/main/files/templates/post.tmpl#L25-L27) uses `osBase` to get the last element of the filepath. -You can use `--download-media=bundle` to save the post media in a single directory with its original post. In this case, the post's filename will be used as the directory name and the post filename will be `index.{extension}`. This is done specifically to support Hugo [page bundles](https://gohugo.io/content-management/page-bundles). +### Bundling -For example, `--download-media="./bundle" --filename='{{ .Post.CreatedAt | date "2006-01-02" }}-{{.Post.Id}}.md'` will create a `YYYY-MM-DD-/` directory, with the post saved as `YYYY-MM-DD-/index.md` and media saved as `YYYY-MM-DD-/.`. \ No newline at end of file +You can use `--download-media=bundle` to save the post media in a single directory with its original post. In this case, the post's filename will be used as the directory name and the post filename will be `index.{extension}`. + +For example, `--download-media="./bundle" --filename='{{ .Post.CreatedAt | date "2006-01-02" }}-{{.Post.Id}}.md'` will create a `YYYY-MM-DD-/` directory, with the post saved as `YYYY-MM-DD-/index.md` and media saved as `YYYY-MM-DD-/.`. + +This is done specifically to support Hugo [page bundles](https://gohugo.io/content-management/page-bundles). 
diff --git a/client/client.go b/client/client.go index 9b1fd9b..c504707 100644 --- a/client/client.go +++ b/client/client.go @@ -28,15 +28,6 @@ type Client struct { options ClientOptions } -type PostsFilter struct { - ExcludeReplies bool - ExcludeReblogs bool - Limit int - SinceId string - MinId string - MaxId string -} - func New(userURL string, filters PostsFilter, opts ClientOptions) (Client, error) { var client Client parsedURL, err := url.Parse(userURL) diff --git a/client/post.go b/client/post.go index c3fac0a..e738341 100644 --- a/client/post.go +++ b/client/post.go @@ -57,6 +57,18 @@ type Post struct { descendants []*Post } +type PostsFilter struct { + ExcludeReplies bool + ExcludeReblogs bool + Limit int + SinceId string + MinId string + MaxId string + OnlyMedia bool + Pinned bool + Tagged string +} + func (p Post) ShouldSkip(visibility string) bool { if visibility == "" { return false @@ -126,6 +138,18 @@ func FetchPosts(baseURL string, accountId string, filters PostsFilter) ([]Post, queryValues.Add("min_id", filters.MinId) } + if filters.Tagged != "" { + queryValues.Add("tagged", filters.Tagged) + } + + if filters.OnlyMedia { + queryValues.Add("only_media", strconv.Itoa(1)) + } + + if filters.Pinned { + queryValues.Add("pinned", strconv.Itoa(1)) + } + queryValues.Add("limit", strconv.Itoa(filters.Limit)) query := fmt.Sprintf("?%s", queryValues.Encode()) diff --git a/main.go b/main.go index 01b6a89..730e8c9 100644 --- a/main.go +++ b/main.go @@ -14,19 +14,22 @@ import ( func main() { dist := flag.String("dist", "./posts", "Path to directory where files will be written") user := flag.String("user", "", "URL of Mastodon account whose toots will be fetched") - excludeReplies := flag.Bool("exclude-replies", false, "Exclude replies to other users") - excludeReblogs := flag.Bool("exclude-reblogs", false, "Exclude reblogs") - limit := flag.Int("limit", 40, "Maximum number of posts to fetch") - sinceId := flag.String("since-id", "", "Fetch posts greater than this 
id") - maxId := flag.String("max-id", "", "Fetch posts older than this id") - minId := flag.String("min-id", "", "Fetch posts newer than this id") + excludeReplies := flag.Bool("exclude-replies", false, "Mastodon API parameter: Filter out statuses in reply to a different account") + excludeReblogs := flag.Bool("exclude-reblogs", false, "Mastodon API parameter: Filter out boosts from the response") + limit := flag.Int("limit", 40, "Mastodon API parameter: Maximum number of results to return. Defaults to 20 statuses. Max 40 statuses") + onlyMedia := flag.Bool("only-media", false, "Mastodon API parameter: Filter out status without attachments") + pinned := flag.Bool("pinned", false, "Mastodon API parameter: Filter for pinned statuses only") + sinceId := flag.String("since-id", "", "Mastodon API parameter: All results returned will be greater than this ID. In effect, sets a lower bound on results.") + maxId := flag.String("max-id", "", "Mastodon API parameter: All results returned will be lesser than this ID. In effect, sets an upper bound on results.") + minId := flag.String("min-id", "", "Mastodon API parameter: Returns results immediately newer than this ID. 
In effect, sets a cursor at this ID and paginates forward.") + tagged := flag.String("tagged", "", "Mastodon API parameter: Filter for statuses using a specific hashtag") persistFirst := flag.String("persist-first", "", "Location to persist the post id of the first post returned") persistLast := flag.String("persist-last", "", "Location to persist the post id of the last post returned") templateFile := flag.String("template", "", "Template to use for post rendering, if passed") threaded := flag.Bool("threaded", false, "Thread replies for a post in a single file") filenameTemplate := flag.String("filename", "", "Template for post filename") porcelain := flag.Bool("porcelain", false, "Prints the amount of fetched posts to stdout in a parsable manner") - downloadMedia := flag.String("download-media", "", "Download media in a post. Omit or pass an empty string to not download media. Pass 'bundle' to download the media inline in a single directory with its original post. Pass a path to a directory to download all media there.") + downloadMedia := flag.String("download-media", "", "Path where post attachments will be downloaded. Omit to skip downloading attachments.") visibility := flag.String("visibility", "", "Filter out posts whose visibility does not match the passed visibility value") flag.Parse() @@ -38,6 +41,9 @@ func main() { SinceId: *sinceId, MaxId: *maxId, MinId: *minId, + OnlyMedia: *onlyMedia, + Pinned: *pinned, + Tagged: *tagged, }, client.ClientOptions{ Threaded: *threaded, Visibility: *visibility,