From: self
Date: Mon, 31 Jul 2023 22:23:13 +0000 (-0700)
Subject: Initial commit
X-Git-Url: http://these/git/%7Bthis.props.src%7D?a=commitdiff_plain;h=262b488c58c37aa5428a465e1847609656c94236;p=sneer-archive-data.git

Initial commit
---

262b488c58c37aa5428a465e1847609656c94236
diff --git a/.envrc b/.envrc
new file mode 100644
index 0000000..a4f3544
--- /dev/null
+++ b/.envrc
@@ -0,0 +1,7 @@
+if ! has nix_direnv_version || ! nix_direnv_version 2.3.0; then
+  source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/2.3.0/direnvrc" "sha256-Dmd+j63L84wuzgyjITIfSxSD57Tx7v51DMxVZOsiUD8="
+fi
+use flake
+
+source_env_if_exists .envrc.private
+watch_file template.nix
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bda639a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+.DS_Store
+.idea
+*.log
+tmp/
+
+result
+.envrc.private
+.direnv/
diff --git a/SneerClub_comments.jsonl.zst b/SneerClub_comments.jsonl.zst
new file mode 100644
index 0000000..1154538
Binary files /dev/null and b/SneerClub_comments.jsonl.zst differ
diff --git a/SneerClub_submissions.jsonl.zst b/SneerClub_submissions.jsonl.zst
new file mode 100644
index 0000000..58f4880
Binary files /dev/null and b/SneerClub_submissions.jsonl.zst differ
diff --git a/bdfr.tar.zst b/bdfr.tar.zst
new file mode 100644
index 0000000..a3fa14f
Binary files /dev/null and b/bdfr.tar.zst differ
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000..3dfe0e6
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,59 @@
+{
+  "nodes": {
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1689068808,
+        "narHash": "sha256-6ixXo3wt24N/melDWjq70UuHQLxGV8jZvooRanIHXw0=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "919d646de7be200f3bf08cb76ae1f09402b6f9b4",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1689078114,
+        "narHash": "sha256-osG8BrX5RpKJ7wH+vI6auOU+ctvNOblT4XXCgknK47c=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "b6cc7ff8fee93789bc871a267ab876c3fca042cb",
+        "type": "github"
+      },
+      "original": {
+        "id": "nixpkgs",
+        "type": "indirect"
+      }
+    },
+    "root": {
+      "inputs": {
+        "flake-utils": "flake-utils",
+        "nixpkgs": "nixpkgs"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000..cf6708b
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,74 @@
+{
+  description = "r/SneerClub's contents preserved as JSON files";
+
+  inputs = { flake-utils.url = "github:numtide/flake-utils"; };
+
+  outputs = { self, nixpkgs, flake-utils }:
+    flake-utils.lib.eachDefaultSystem (system:
+      let
+        pkgs = nixpkgs.legacyPackages."${system}";
+
+        # Miller DSL script applied to every thread record: renders Markdown
+        # bodies to HTML with pandoc and adds a readable created_date.
+        process-markdown = pkgs.writeText "process-markdown.mlr" ''
+          # Render Markdown text to HTML. The heredoc delimiter is quoted so
+          # the shell copies the text verbatim; unquoted, it would expand
+          # variables, command substitutions and backticks in scraped posts.
+          func render_markdown(text) {
+            return system("${pkgs.pandoc}/bin/pandoc -f markdown <<'__EOF__'\n".text."\n__EOF__");
+          }
+
+          # Format epoch seconds as a UTC date string via date(1).
+          func format_utc(utc) {
+            return system("date -d @".utc." -u +'%B %d, %Y %I:%M %p'");
+          }
+
+          # Return the comment list r with every body rendered to HTML and a
+          # created_date added, recursing into nested replies. The caller
+          # assigns the result back into the record.
+          func process_replies(r) {
+            if (!is_array(r) && !is_map(r)) {
+              return r;
+            }
+            for (i, v in r) {
+              md = render_markdown(v["body"]);
+              if (!is_error(md)) {
+                r[i]["body"] = md;
+              }
+              r[i]["created_date"] = format_utc(v["created_utc"]);
+              if (is_present(v["replies"])) {
+                r[i]["replies"] = process_replies(v["replies"]);
+              }
+            }
+            return r;
+          }
+
+          $selftext = render_markdown($selftext);
+          $created_date = format_utc($created_utc);
+          $comments = process_replies($comments);
+        '';
+      in {
+        # Unpack the BDFR dump, post-process every thread with the script
+        # above, then derive listings sorted by score and by comment count.
+        packages.json-threads = pkgs.runCommand "process-json" { } ''
+          shopt -s globstar
+          mkdir -p $out
+          tar -I ${pkgs.zstd}/bin/zstd -xf ${./bdfr.tar.zst}
+          ${pkgs.miller}/bin/mlr --json \
+            put -f ${process-markdown} then \
+            sort -nr created_utc \
+            ./bdfr/**/*.json > $out/threads-newest.json
+          ${pkgs.miller}/bin/mlr --json \
+            cut -x -f comments,selftext then \
+            sort -nr score \
+            $out/threads-newest.json > $out/submissions-bestest.json
+          ${pkgs.miller}/bin/mlr --json \
+            cut -x -f comments,selftext then \
+            sort -nr num_comments \
+            $out/threads-newest.json > $out/submissions-longest.json
+        '';
+
+        packages.default = self.packages."${system}".json-threads;
+
+      });
+}