"""Import comments from a Disqus export (converted to JSON) into static
comment files (one .wpcomment file per comment, next to each post's source).

This script references Nikola's `site` object without defining it, so it is
presumably meant to run somewhere that provides it (e.g. `nikola console`);
that is an assumption, not something the script states.
"""

import json
from collections import defaultdict

import parse

import conf


def clean_url(u):
    """Strip the scheme and any query string, leaving host + path."""
    if "//" in u:
        return u.split("//", 1)[1].split("?", 1)[0]
    return u


BASE_URL = clean_url(conf.SITE_URL)

with open("disqus-data.json") as inf:
    data = json.load(inf)

# Each fix is a parse pattern and a format pattern. Get data using one, set
# data using the other.
thread_link_fixes = [
    [
        "{}://web.archive.org/web/{:d}/http://lateral.netmanagers.com.ar/{path}",
        "http://ralsina.me/{path}",
    ],
    ["{}://lateral.netmanagers.com.ar/{path}", "http://ralsina.me/{path}"],
    ["{}://localhost:8080/{path}", "http://ralsina.me/{path}"],
    [
        "http://feedproxy.google.com/~r/LateralOpinion/~3/{}/{path}",
        "http://ralsina.me/weblog/posts/{path}",
    ],
    [
        "https://disqus.com/home/discussion/lateralopinion/bb{numero1:d}_{numero2:d}/",
        "http://ralsina.me/weblog/posts/BB{numero1}.{numero2:02d}.html",
    ],
    ["{}://example.com/posts/{file}", "http://ralsina.me/weblog/posts/{file}"],
    ["{start}/bbs{id:d}", "{start}/BBS{id:d}"],  # bbs -> BBS
    [
        "{}://ralsina.me/tr/es/{path}",
        "http://ralsina.me/{path}",
    ],  # unify translation comments
    [
        "{}://ralsina.me/weblog/posts/{folder}/",
        "http://ralsina.me/weblog/posts/{folder}.html",
    ],  # unprettify URLs
    [
        "https://ralsina.me/weblog/posts/old-guy-the-terminal-ep-2-puede-fallar.html",
        "https://ralsina.me/weblog/posts/old-guy-the-terminal-ep-3-puede-fallar.html",
    ],
    ["https://ralsina.me/stories/nombres.html", "https://ralsina.me/stories/nombres/"],
]

# Ancient test threads
ignore_links = {
    "http://ralsina.me/weblog/posts/xxx.html",
    "http://ubuntuone.com/p/lKX/",
    "http://ralsina.me/weblog/posts/index.html",
}

# Collect all the post/page URLs in the site, keyed by normalized URL
site_urls = {}
for p in site.timeline:
    site_urls[clean_url(p.permalink(absolute=True))] = p

# Index the Disqus comments by thread and by their own id
posts_per_thread = defaultdict(list)
posts_by_id = {}

for post in data["disqus"]["post"]:
    posts_per_thread[post["thread"]["@dsq:id"]].append(post)
    posts_by_id[post["@dsq:id"]] = post

threads = {}
for t in data["disqus"]["thread"]:
    if t["@dsq:id"] not in posts_per_thread:
        # Empty thread, don't care
        continue
    if t["link"] in ignore_links:
        print(f'Ignoring {t["link"]}')
        continue
    if clean_url(t["link"]) not in site_urls:
        # Try each fix in turn until the link matches a known site URL
        for parser, formatter in thread_link_fixes:
            parsed = parse.parse(parser, t["link"])
            if parsed is not None:
                t["link"] = formatter.format(**parsed.named)
                if clean_url(t["link"]) in site_urls:
                    break
        else:
            print(f'Unfixed thread link: {t["link"]}')
    threads[t["@dsq:id"]] = t


def find_post_for_thread(thread, site):
    """Return the site post whose permalink matches the thread's link, or None."""
    # Use the same normalization as site_urls above, so query strings and
    # scheme differences don't break the match.
    link = clean_url(thread["link"])
    for p in site.timeline:
        if clean_url(p.permalink(absolute=True)) == link:
            return p
    return None


for t in threads:
    post = find_post_for_thread(threads[t], site)
    if post is None:
        print('Orphan thread ===>', threads[t])
        continue
    # Comment files live next to the post source: <source>.<comment id>.wpcomment
    base_path = post.source_path.split('.')[0]
    for comment in posts_per_thread[t]:
        if comment['isDeleted'] == 'true':
            continue
        comment_path = f"{base_path}.{comment['@dsq:id']}.wpcomment"
        print(comment_path)
        with open(comment_path, "w") as outf:
            output = f""".. id: {comment['@dsq:id']}
.. approved: True
.. author: {comment['author']['name']}
.. date_utc: {comment['createdAt']}
.. compiler: html

{comment['message']}"""
            if 'parent' in comment:
                # Disqus parent ids look like "...=<id>"; keep only the id part.
                parent = comment['parent']['@dsq:id'].split('=')[-1]
                output = f".. parent_id: {parent}\n" + output
            outf.write(output)
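
# For reference, each generated .wpcomment file looks roughly like this
# (metadata lines, then the HTML message; the parent_id line appears only
# for replies). The values shown are hypothetical:
#
#   .. parent_id: 123456788
#   .. id: 123456789
#   .. approved: True
#   .. author: Some Name
#   .. date_utc: 2010-01-01T00:00:00
#   .. compiler: html
#
#   <p>Comment body</p>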
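
# A minimal sketch of how one (parse pattern, format pattern) pair from
# thread_link_fixes rewrites a URL. It is illustrative only and not used by
# the migration above; the function name and sample URL are hypothetical.
def _demo_fix_pair():
    old_link = "http://localhost:8080/weblog/posts/foo.html"
    parsed = parse.parse("{}://localhost:8080/{path}", old_link)
    # The anonymous "{}" swallows the scheme; named fields land in
    # parsed.named, here {"path": "weblog/posts/foo.html"}.
    new_link = "http://ralsina.me/{path}".format(**parsed.named)
    assert new_link == "http://ralsina.me/weblog/posts/foo.html"
    return new_link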