Skip to content

Commit 2e6840c

Browse files
committed
add generate_main_md_file
1 parent 461c16c commit 2e6840c

1 file changed

Lines changed: 82 additions & 0 deletions

File tree

src/substack2markdown/substack_scraper.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
DEFAULT_IMAGE_PATH_FORMAT = "p/$post_slug/images/$image_filename"
4141
DEFAULT_MD_PATH_FORMAT = "p/$post_slug/readme.md"
4242
DEFAULT_HTML_PATH_FORMAT = "p/$post_slug/index.html"
43+
DEFAULT_POSTS_MD_PATH_FORMAT = "readme.md"
4344
DEFAULT_POSTS_HTML_PATH_FORMAT = "index.html"
4445
DEFAULT_POSTS_JSON_PATH_FORMAT = "posts.json"
4546
DEFAULT_POST_JSON_PATH_FORMAT = "p/$post_slug/post.json"
@@ -119,6 +120,7 @@ def __init__(self, args):
119120
self.md_path_template = string.Template(self.args.md_path_format)
120121
self.html_path_template = string.Template(self.args.html_path_format)
121122
self.image_path_template = string.Template(self.args.image_path_format)
123+
self.posts_md_path_template = string.Template(self.args.posts_md_path_format)
122124
self.posts_html_path_template = string.Template(self.args.posts_html_path_format)
123125
self.posts_json_path_template = string.Template(self.args.posts_json_path_format)
124126
self.post_json_path_template = string.Template(self.args.post_json_path_format)
@@ -689,8 +691,77 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
689691
if num_posts_to_scrape != 0 and count == num_posts_to_scrape:
690692
break
691693
self.save_posts_data_json(posts_data)
694+
self.generate_main_md_file()
692695
self.generate_main_html_file()
693696

697+
def generate_main_md_file(self) -> None:
    """
    Generate the Markdown index file for the scraped publication.

    Reads the posts index JSON (``posts_json_path_template`` resolved under
    ``format_vars["output_directory"]``), loads the per-post JSON file of the
    post with the highest id to obtain the publication and author details
    (its ``"pub"`` entry), and writes a heading, the author line, the author
    bio and a "Posts" list (highest id first) to the path given by
    ``posts_md_path_template``.

    Each post is listed as an inline HTML ``<a>`` element; the link target,
    the subtitle (used as ``title`` attribute) and the post title are
    HTML-escaped. Like/comment/repost counters are appended when positive.

    Nothing is written when the posts index is empty, because the
    publication details are only reachable through a post's JSON file.

    Raises:
        OSError: If a JSON file cannot be read or the output file cannot be
            written.
        KeyError: If a required field is missing from the JSON data.
    """
    import html  # local import: only needed to escape the inline HTML links

    output_directory = self.format_vars["output_directory"]

    # Read JSON data
    posts_json_path = os.path.join(
        output_directory,
        self.posts_json_path_template.substitute(self.format_vars)
    )
    with open(posts_json_path, 'r', encoding='utf-8') as file:
        posts_data = json.load(file)

    if not posts_data:
        # No posts means there is no post JSON to take the publication from.
        return

    # sort by post_id, descending
    posts_data.sort(key=lambda p: p["id"], reverse=True)

    # "post_json" is resolved relative to the directory of the posts index.
    latest_post_json_path = os.path.join(
        os.path.dirname(posts_json_path),
        posts_data[0]["post_json"]
    )
    with open(latest_post_json_path, 'r', encoding='utf-8') as file:
        publication = json.load(file)["pub"]

    # Link the author by id rather than by "@handle": the original note marks
    # the handle URL as "variable" and the profile-id URL as "constant".
    author_url = f'https://substack.com/profile/{publication["author_id"]}'

    # A single newline is not a paragraph break in Markdown, so double them.
    # The bio is treated as optional: a missing or null bio becomes empty.
    author_bio = (publication.get("author_bio") or "").replace("\n", "\n\n")

    # Build the whole document first so that a missing key cannot leave a
    # truncated, half-written file behind.
    lines = [
        f'# {publication["name"]}\n',
        '\n',
        f'by [{publication["author_name"]}]({author_url})\n',
        '\n',
        f'{author_bio}\n',
        '\n',
        '\n',
        '\n',
        '## Posts\n',
        '\n',
    ]

    # (symbol, key) pairs rendered after the link, e.g. "❤123 🗨45 ↻6"
    counters = (
        ("❤", "like_count"),
        ("🗨", "comment_count"),
        ("↻", "repost_count"),
    )

    for post in posts_data:
        # TODO use args.datetime_format
        post_date = post["date"]
        # Subtitle is treated as optional: missing or null becomes empty.
        subtitle = post.get("subtitle") or ""
        post_link = '<a id="post{}" href="{}" title="{}">{}</a>'.format(
            post["id"],
            html.escape(post["file_link"]),
            html.escape(subtitle),
            # element text: quotes are harmless there, so leave them readable
            html.escape(post["title"], quote=False),
        )
        for symbol, key in counters:
            count = post.get(key) or 0  # missing or null counts as zero
            if count > 0:
                post_link += f" {symbol}{count}"
        lines.append(f'- {post_date} - {post_link}\n')

    md_output_path = os.path.join(
        output_directory,
        self.posts_md_path_template.substitute(self.format_vars)
    )
    with open(md_output_path, 'w', encoding='utf-8') as file:
        file.writelines(lines)
764+
694765
def generate_main_html_file(self) -> None:
695766
"""
696767
Generates a HTML file for the given author.
@@ -706,6 +777,11 @@ def generate_main_html_file(self) -> None:
706777
# Convert JSON data to a JSON string for embedding
707778
embedded_json_data = json.dumps(posts_data, **json_dump_kwargs)
708779

780+
md_output_path = os.path.join(
781+
self.format_vars["output_directory"],
782+
self.posts_md_path_template.substitute(self.format_vars)
783+
)
784+
709785
html_output_path = os.path.join(
710786
self.format_vars["output_directory"],
711787
self.posts_html_path_template.substitute(self.format_vars)
@@ -1069,6 +1145,12 @@ def parse_args() -> argparse.Namespace:
10691145
default=DEFAULT_IMAGE_PATH_FORMAT,
10701146
help=f"The file path format to save scraped image files. Default: {DEFAULT_IMAGE_PATH_FORMAT!r}",
10711147
)
1148+
parser.add_argument(
1149+
"--posts-md-path-format", # args.posts_md_path_format
1150+
type=str,
1151+
default=DEFAULT_POSTS_MD_PATH_FORMAT,
1152+
help=f"The file path format to save an index of scraped posts as Markdown file. Default: {DEFAULT_POSTS_MD_PATH_FORMAT!r}",
1153+
)
10721154
parser.add_argument(
10731155
"--posts-html-path-format", # args.posts_html_path_format
10741156
type=str,

0 commit comments

Comments
 (0)