py/tumblrcap: init

2023-01-17 19:36:29 +00:00 · 2023-01-17 19:36:29 +00:00 · e85f6fc6ce
commit e85f6fc6ce
parent 8731a6a37f
6 changed files with 441 additions and 0 deletions
--- a/.hgignore
+++ b/.hgignore
@@ -12,6 +12,8 @@ rust/passgen/target/
 web/quotes/theme/static/
 py/tumblrcap/dl/
 syntax: regexp
 ^go/trains/.*/start.sh$
 ^go/trains/.*/lukegb-trains.json$
--- a/py/default.nix
+++ b/py/default.nix
@@ -5,4 +5,5 @@
 args: {
  valveindexinstock = import ./valveindexinstock args;
  icalfilter = import ./icalfilter args;
  tumblrcap = import ./tumblrcap args;
 }
--- a/py/tumblrcap/__init__.py
+++ b/py/tumblrcap/__init__.py
--- a/py/tumblrcap/__main__.py
+++ b/py/tumblrcap/__main__.py
@@ -0,0 +1,350 @@
 import copy
 import functools
 import hashlib
 import json
 import os
 import shutil
 from pathlib import Path
 from typing import Dict, List, Optional
 import attrs
 import bs4
 import pytumblr2
 import requests
 import requests_oauthlib
 from absl import app
 # Tumblr OAuth endpoints used by make_tumblr_client(): the request-token URL,
 # the URL the user visits to authorize, and the access-token URL.
 TUMBLR_REQUEST_TOKEN_URL = "https://www.tumblr.com/oauth/request_token"
 TUMBLR_AUTHORIZE_URL = "https://www.tumblr.com/oauth/authorize"
 TUMBLR_ACCESS_TOKEN_URL = "https://www.tumblr.com/oauth/access_token"
@attrs.frozen
class Credentials:
    """Immutable set of Tumblr API credentials persisted as a JSON file.

    The OAuth token pair defaults to ``None``; ``with_token`` returns a copy
    with the pair filled in.
    """

    consumer_key: str
    consumer_secret: str
    oauth_token: Optional[str] = attrs.field(default=None)
    oauth_token_secret: Optional[str] = attrs.field(default=None)

    @staticmethod
    def _path():
        """Return the path of the credentials file under the home directory."""
        return Path.home() / ".config" / "tumblrcap" / "credentials.json"

    @classmethod
    def load(cls):
        """Build an instance from the saved credentials file.

        Raises:
            FileNotFoundError: if the credentials file does not exist.
        """
        with open(cls._path(), "rt") as f:
            return cls(**json.load(f))

    def save(self):
        """Write these credentials to the credentials file as JSON."""
        path = self._path()
        # The config directory does not exist on a first run; without this the
        # open() below raises FileNotFoundError.
        path.parent.mkdir(parents=True, exist_ok=True)
        # The file holds secrets: when it is first created, make it readable
        # by the owner only. An existing file keeps its current mode.
        path.touch(mode=0o600, exist_ok=True)
        with open(path, "wt") as f:
            json.dump(attrs.asdict(self), f)

    def with_token(self, oauth_token, oauth_token_secret):
        """Return a copy of these credentials carrying the given token pair."""
        return attrs.evolve(
            self, oauth_token=oauth_token, oauth_token_secret=oauth_token_secret
        )
 def make_tumblr_client() -> pytumblr2.TumblrRestClient:
    """Return a Tumblr REST client, prompting for credentials when needed.

    Saved credentials are loaded if present; otherwise the consumer key and
    secret are read from stdin and saved. If no OAuth token is stored yet, the
    interactive OAuth flow is run (the user opens the printed URL and pastes
    the redirect URL back) and the resulting token pair is saved.
    """
    try:
        creds = Credentials.load()
    except FileNotFoundError:
        consumer_key = input("Consumer key: ").strip()
        consumer_secret = input("Consumer secret: ").strip()
        creds = Credentials(consumer_key=consumer_key, consumer_secret=consumer_secret)
        creds.save()

    if not creds.oauth_token:
        # Step 1: obtain a temporary request token.
        oauth_session = requests_oauthlib.OAuth1Session(
            creds.consumer_key, client_secret=creds.consumer_secret
        )
        fetch_response = oauth_session.fetch_request_token(TUMBLR_REQUEST_TOKEN_URL)
        resource_owner_key = fetch_response.get("oauth_token")
        resource_owner_secret = fetch_response.get("oauth_token_secret")

        # Step 2: have the user authorize and paste back the redirect URL.
        full_authorize_url = oauth_session.authorization_url(TUMBLR_AUTHORIZE_URL)
        print("\nAuthorize: {}".format(full_authorize_url))
        redirect_response = input("Full redirect URL here:\n").strip()
        oauth_response = oauth_session.parse_authorization_response(redirect_response)
        verifier = oauth_response.get("oauth_verifier")

        # Step 3: exchange the verifier for the access token pair.
        # Only the `requests_oauthlib` module is imported, so the class must
        # be referenced through it (a bare `OAuth1Session` is a NameError).
        oauth_session = requests_oauthlib.OAuth1Session(
            creds.consumer_key,
            client_secret=creds.consumer_secret,
            resource_owner_key=resource_owner_key,
            resource_owner_secret=resource_owner_secret,
            verifier=verifier,
        )
        oauth_tokens = oauth_session.fetch_access_token(TUMBLR_ACCESS_TOKEN_URL)
        creds = creds.with_token(
            oauth_tokens.get("oauth_token"), oauth_tokens.get("oauth_token_secret")
        )
        creds.save()

    return pytumblr2.TumblrRestClient(
        creds.consumer_key,
        creds.consumer_secret,
        creds.oauth_token,
        creds.oauth_token_secret,
    )
 def all_likes(client):
    """Yield every liked post, following the pagination links.

    Each post is passed through ``pytumblr2.simulate_legacy_payload`` before
    being yielded.
    """
    kwargs = {}
    while True:
        this_likes = client.likes(**kwargs)
        # Yield the page before looking at the pagination link, so the final
        # page (the one without a "next" link) is not dropped.
        for post in this_likes.get("liked_posts", []):
            yield pytumblr2.simulate_legacy_payload(post)
        if "_links" in this_likes and "next" in this_likes["_links"]:
            kwargs = this_likes["_links"]["next"]["query_params"]
        else:
            break
 def all_posts(client, blogname):
    """Yield every post of ``blogname``, following the pagination links.

    Each post is passed through ``pytumblr2.simulate_legacy_payload`` before
    being yielded.
    """
    kwargs = {}
    while True:
        # The "next" query parameters replace kwargs wholesale, so the blog
        # name has to be re-applied on every request.
        kwargs['blogname'] = blogname
        this_posts = client.posts(**kwargs)
        # Yield the page before looking at the pagination link, so the final
        # page (the one without a "next" link) is not dropped.
        for post in this_posts.get("posts", []):
            yield pytumblr2.simulate_legacy_payload(post)
        if "_links" in this_posts and "next" in this_posts["_links"]:
            kwargs = this_posts["_links"]["next"]["query_params"]
        else:
            break
 def dump_likes(client: pytumblr2.TumblrRestClient, out_path: Path) -> None:
    """Write every liked post to ``out_path`` as ``<id_string>.json``.

    The ids are also written, in the order the posts were received, to
    ``_order.json`` in the same directory.
    """
    out_path.mkdir(parents=True, exist_ok=True)
    seen_ids = []
    for liked in all_likes(client):
        destination = out_path / "{}.json".format(liked["id_string"])
        with open(destination, "wt") as handle:
            json.dump(liked, handle)
            print(destination)
        seen_ids.append(liked["id_string"])
    with open(out_path / "_order.json", "wt") as handle:
        json.dump(seen_ids, handle)
 def dump_blog(client: pytumblr2.TumblrRestClient, blog_id: str, out_path: Path) -> None:
    """Write every post of blog ``blog_id`` to ``out_path`` as ``<id_string>.json``.

    The ids are also written, in the order the posts were received, to
    ``_order.json`` in the same directory.
    """
    out_path.mkdir(parents=True, exist_ok=True)
    seen_ids = []
    for entry in all_posts(client, blog_id):
        destination = out_path / "{}.json".format(entry["id_string"])
        with open(destination, "wt") as handle:
            json.dump(entry, handle)
            print(destination)
        seen_ids.append(entry["id_string"])
    with open(out_path / "_order.json", "wt") as handle:
        json.dump(seen_ids, handle)
 def fetch_image(
    sess: requests.Session, known_images: Dict[str, str], output_dir: Path, src: str
 ) -> str:
    """Download ``src`` into ``output_dir`` and return its archive-relative path.

    The local file name is the SHA-256 hex digest of the URL plus the
    extension of the URL's path component (``.raw`` when it has none). Files
    are sharded into two directory levels named after the first four hex
    digits of the digest.

    Args:
        sess: session used for the download; untouched when the file exists.
        known_images: cache mapping source URL to returned path; updated here.
        output_dir: root directory of the image store.
        src: URL of the image.

    Returns:
        ``"_images/<aa>/<bb>/<file name>"`` for the stored image.
    """
    from urllib.parse import urlsplit

    if src in known_images:
        return known_images[src]

    target_filename_hash = hashlib.sha256()
    target_filename_hash.update(src.encode("utf8"))

    # Derive the extension from the URL *path* only. Looking for the last "."
    # in the whole URL picks up the host name for extension-less URLs (giving
    # a "suffix" with slashes in it) and drags query strings into the name.
    try:
        src_suffix = os.path.splitext(urlsplit(src).path)[1]
    except ValueError:
        # urlsplit rejects some malformed URLs; fall back to the generic suffix.
        src_suffix = ""
    if not src_suffix:
        src_suffix = ".raw"

    target_filename = "{}{}".format(target_filename_hash.hexdigest(), src_suffix)
    target_dir = output_dir / target_filename[0:2] / target_filename[2:4]
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = target_dir / target_filename
    if not target_path.is_file():
        # Download to a temporary name and rename once complete, so that an
        # interrupted run cannot leave a truncated file that later runs would
        # report as "already exists".
        partial_path = target_dir / (target_filename + ".part")
        # NOTE(review): the HTTP status is not checked, so an error response
        # body is stored as if it were the image - confirm that is intended.
        with sess.get(src, stream=True) as r:
            r.raw.read = functools.partial(r.raw.read, decode_content=True)
            with open(partial_path, "wb") as f:
                shutil.copyfileobj(r.raw, f)
        partial_path.replace(target_path)
    else:
        print(target_path, "already exists")

    final_src = "_images/{}/{}/{}".format(
        target_filename[0:2], target_filename[2:4], target_filename
    )
    known_images[src] = final_src
    return final_src
 def mutate_attr(d, attr_path, f):
    """Apply ``f`` in place to every value addressed by ``attr_path`` in ``d``.

    ``attr_path`` is a list of keys. The special key ``"*"`` means "every
    element of the list at this level"; any other key is looked up in a dict.
    Paths that do not match the shape of the data (missing key, ``"*"`` on a
    non-list, a key on a non-dict) are silently ignored.

    Args:
        d: nested structure of dicts and lists; modified in place.
        attr_path: list of keys leading to the value(s) to replace.
        f: called with the current value; its result is stored back.
    """
    if not attr_path:
        return
    head, rest = attr_path[0], attr_path[1:]

    if head == "*":
        if not isinstance(d, list):
            return
        if rest:
            for item in d:
                mutate_attr(item, rest, f)
        else:
            for n, item in enumerate(d):
                d[n] = f(item)
        return

    # Only dicts can be indexed by key. Without this check, `head in d` raises
    # TypeError for None/int and does a substring test for str, after which
    # the item access fails.
    if not isinstance(d, dict) or head not in d:
        return
    if rest:
        mutate_attr(d[head], rest, f)
    else:
        d[head] = f(d[head])
 def fetch_image_for_post(
    sess: requests.Session, known_images, output_dir, post
 ) -> (Dict, int):
    """Return a copy of ``post`` whose image references point at local files.

    HTML-valued attributes have the ``src`` of every ``<img>`` replaced by the
    path returned from ``fetch_image``; URL-valued attributes are replaced
    outright. Finally, the paths of all remaining string values containing an
    http(s) URL that are not known to be uninteresting are printed with a
    ``MAYBEHTML`` prefix.

    Args:
        sess: session passed through to ``fetch_image``.
        known_images: URL -> local path cache passed through to ``fetch_image``.
        output_dir: image store directory passed through to ``fetch_image``.
        post: the post dict; it is not modified.

    Returns:
        ``(new_post, converted)`` where ``converted`` counts replaced references.
    """
    # Deep copy: mutate_attr() rewrites nested dicts/lists in place, so a
    # shallow copy would leak the rewrites into the caller's ``post``.
    post = copy.deepcopy(post)
    converted = 0

    def _process_attr(attr, attr_value: str) -> str:
        """Localize every <img src> inside one HTML-valued attribute."""
        nonlocal converted
        if not isinstance(attr_value, str):
            return attr_value
        print(post['id'], attr, attr_value)
        soup = bs4.BeautifulSoup(attr_value, features="lxml")
        rewritten = False
        for img in soup.find_all("img"):
            src = img.get("src")
            if not src:
                # <img> without a usable src: nothing to download.
                continue
            print(attr, "src", src)
            img["src"] = fetch_image(sess, known_images, output_dir, src)
            converted += 1
            rewritten = True
        # Re-serializing through lxml wraps fragments in <html><body>; keep
        # the original text untouched when no image was actually replaced.
        return str(soup) if rewritten else attr_value

    def _process_url_attr(attr, attr_value: str) -> str:
        """Localize one attribute whose whole value is an image URL."""
        nonlocal converted
        if not isinstance(attr_value, str) or not attr_value:
            return attr_value
        print(post['id'], attr, attr_value)
        new_src = fetch_image(sess, known_images, output_dir, attr_value)
        converted += 1
        return new_src

    HTML_ATTRIBUTES = [
        "body",
        "caption",
        "trail.*.content",
        "trail.*.content_raw",
        "reblog.tree_html",
        "reblog.comment",
        "body_abstract",
        "trail.*.content_abstract",
        "answer",
        "summary",
    ]
    for attr in HTML_ATTRIBUTES:
        mutate_attr(post, attr.split("."), functools.partial(_process_attr, attr))

    URL_ATTRIBUTES = [
        "trail.*.content.*.media.*.url",
        "trail.*.content.*.media.url",
        "photos.*.original_size.url",
        "photos.*.panorama_size.url",
    ]
    for attr in URL_ATTRIBUTES:
        mutate_attr(post, attr.split("."), functools.partial(_process_url_attr, attr))

    # Try to find things that look like URLs.
    def _traverse(path, vs):
        """Yield the dotted path of every string containing an http(s) URL."""
        if isinstance(vs, dict):
            for k, v in vs.items():
                yield from _traverse(path + [k], v)
        elif isinstance(vs, list):
            for v in vs:
                yield from _traverse(path + ["*"], v)
        elif isinstance(vs, str):
            if "http://" in vs or "https://" in vs:
                yield ".".join(path)

    KNOWN_NOT_IMAGE = {
        "blog.url",
        "post_url",
        "short_url",
        "trail.*.blog.theme.header_image",
        "trail.*.blog.theme.header_image_focused",
        "trail.*.blog.theme.header_image_scaled",
        "photos.*.alt_sizes.*.url",
        "image_permalink",
        "blog.description",
        "link_url",
        "asking_avatar.*.url",
        "url",
        "asking_url",
        "trail.*.blog.theme.header_image_poster",
        "audio_source_url",
        "audio_url",
        "embed",
        "player",
        "source_url",
    }
    print(
        "MAYBEHTML",
        post["id"],
        sorted(set(_traverse([], post)) - KNOWN_NOT_IMAGE - set(HTML_ATTRIBUTES)),
    )
    return post, converted
 def fetch_images(
    client: pytumblr2.TumblrRestClient, from_path: Path, to_path: Path
 ) -> None:
    """Copy a post dump from ``from_path`` to ``to_path``, localizing images.

    Files whose name starts with ``_`` are copied verbatim. Every other file
    is loaded as a JSON post, run through ``fetch_image_for_post`` and written
    under the same name in ``to_path``. Images are stored in
    ``to_path / "_images"``.

    Args:
        client: not used by this function.
        from_path: directory containing the dumped posts.
        to_path: output directory; created if missing.
    """
    to_path.mkdir(parents=True, exist_ok=True)
    images_path = to_path / "_images"
    images_path.mkdir(parents=True, exist_ok=True)
    known_images = {}
    # Use the session as a context manager so it is closed even when a
    # download or a JSON parse fails.
    with requests.Session() as sess:
        for child in from_path.iterdir():
            if not child.is_file():
                continue
            elif child.name.startswith("_"):
                shutil.copyfile(child, to_path / child.name)
                continue
            with open(child, "rt") as f:
                post = json.load(f)
            print(child, post["type"])
            new_post, post_converted = fetch_image_for_post(
                sess, known_images, images_path, post
            )
            new_child = to_path / child.name
            with open(new_child, "wt") as f:
                json.dump(new_post, f)
            print(child, new_child, post_converted)
 def main(argv: List[str]) -> None:
    """Dispatch to the subcommand named by ``argv[1]``.

    Subcommands: ``dump_likes <path>``, ``dump_blog <blog name> <path>`` and
    ``fetch_images <in_path> <out_path>``.

    Raises:
        app.UsageError: on a missing/unknown subcommand or a wrong number of
            arguments.
    """
    if len(argv) < 2:
        raise app.UsageError("one arg, please")

    # Expected len(argv) and usage text for each subcommand.
    usages = {
        "dump_likes": (3, "{} dump_likes <path>"),
        "dump_blog": (4, "{} dump_blog <blog name> <path>"),
        "fetch_images": (4, "{} fetch_images <in_path> <out_path>"),
    }
    command = argv[1]
    if command not in usages:
        raise app.UsageError("unknown subcommand {!r}".format(command))
    expected_argc, usage = usages[command]
    if len(argv) != expected_argc:
        raise app.UsageError(usage.format(argv[0]))

    # Validate the command line first: creating the client may prompt for
    # credentials and run the OAuth flow, which is wasted on a usage error.
    client = make_tumblr_client()
    client.legacy_conversion_on()

    if command == "dump_likes":
        dump_likes(client, Path(argv[2]))
    elif command == "dump_blog":
        dump_blog(client, argv[2], Path(argv[3]))
    else:
        fetch_images(client, Path(argv[2]), Path(argv[3]))
 # When executed as a script, hand main() to absl's app runner.
 if __name__ == "__main__":
    app.run(main)
--- a/py/tumblrcap/default.nix
+++ b/py/tumblrcap/default.nix
@@ -0,0 +1,75 @@
 # SPDX-FileCopyrightText: 2023 Luke Granger-Brown <depot@lukegb.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 { depot, lib, pkgs, ... }@args:
 let
  # PyTumblr2 built from PyPI against the given Python package set `ps`.
  pytumblr2 = ps: ps.buildPythonPackage rec {
    pname = "PyTumblr2";
    version = "0.2.2";

    src = ps.fetchPypi {
      inherit pname version;
      sha256 = "0xpl4v25kaywyr6fbanhsx9rpmbdvb7zs6hcj6m3md3ddkp9whkf";
    };

    # Runtime dependencies.
    propagatedBuildInputs = [
      ps.future
      ps.requests_oauthlib
    ];

    # Test-only dependencies.
    checkInputs = [
      ps.nose
      ps.nose-cov
      ps.mock
    ];
  };
  # Python interpreter with the packages listed below available.
  python = pkgs.python3.withPackages (ps: [
    ps.absl-py
    ps.attrs
    ps.beautifulsoup4
    ps.requests
    ps.requests_oauthlib
    (pytumblr2 ps)
  ]);
  # Source filter: keep regular *.py / *.html files, and descend into every
  # directory except caches and unrelated trees.
  #
  # `&&` binds tighter than `||` in Nix, so the suffix alternatives must be
  # parenthesised; otherwise the ".html" test applies to paths of any type.
  filterSourcePred = path: type:
    (type == "regular" && (
      lib.hasSuffix ".py" path ||
      lib.hasSuffix ".html" path
    )) || (
      type == "directory" &&
      baseNameOf path != "__pycache__" &&
      baseNameOf path != "node_modules" &&
      baseNameOf path != "config" &&
      baseNameOf path != "web"
    );
  # Installs the *.py sources as the `tumblrcap` package and wraps
  # `python -m tumblrcap` as $out/bin/tumblrcap.
  tumblrcap = pkgs.stdenvNoCC.mkDerivation rec {
    name = "tumblrcap";

    src = builtins.filterSource filterSourcePred ./.;

    # makeWrapper is a build-time tool, so it belongs in nativeBuildInputs.
    nativeBuildInputs = with pkgs; [ makeWrapper ];
    propagatedBuildInputs = [ python ];

    installPhase = ''
      runHook preInstall

      sitepkgdir="$out/lib/${python.libPrefix}/site-packages"
      pkgdir="$sitepkgdir/tumblrcap"
      mkdir -p "$pkgdir"
      cp -R \
        *.py \
        "$pkgdir"

      mkdir -p "$out/bin"
      makeWrapper "${python}/bin/python" "$out/bin/tumblrcap" \
        --add-flags "-m" \
        --add-flags "tumblrcap" \
        --suffix PYTHONPATH : "$sitepkgdir"

      runHook postInstall
    '';

    # Exposed so shell.nix can reuse the same interpreter environment.
    passthru.pythonEnv = python;
  };
 in
 tumblrcap
--- a/py/tumblrcap/shell.nix
+++ b/py/tumblrcap/shell.nix
@@ -0,0 +1,13 @@
 # Development shell: tumblrcap's Python environment plus black and isort.
 { depot ? import <depot> {} }:
 let
  inherit (depot) pkgs;
  inherit (depot.py) tumblrcap;
 in pkgs.mkShell {
  buildInputs = [
    tumblrcap.pythonEnv
    pkgs.black
    pkgs.python3.pkgs.isort
  ];
 }