4040DEFAULT_IMAGE_PATH_FORMAT = "p/$post_slug/images/$image_filename"
4141DEFAULT_MD_PATH_FORMAT = "p/$post_slug/readme.md"
4242DEFAULT_HTML_PATH_FORMAT = "p/$post_slug/index.html"
43+ DEFAULT_POSTS_MD_PATH_FORMAT = "readme.md"
4344DEFAULT_POSTS_HTML_PATH_FORMAT = "index.html"
4445DEFAULT_POSTS_JSON_PATH_FORMAT = "posts.json"
4546DEFAULT_POST_JSON_PATH_FORMAT = "p/$post_slug/post.json"
@@ -119,6 +120,7 @@ def __init__(self, args):
119120 self .md_path_template = string .Template (self .args .md_path_format )
120121 self .html_path_template = string .Template (self .args .html_path_format )
121122 self .image_path_template = string .Template (self .args .image_path_format )
123+ self .posts_md_path_template = string .Template (self .args .posts_md_path_format )
122124 self .posts_html_path_template = string .Template (self .args .posts_html_path_format )
123125 self .posts_json_path_template = string .Template (self .args .posts_json_path_format )
124126 self .post_json_path_template = string .Template (self .args .post_json_path_format )
@@ -689,8 +691,77 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
689691 if num_posts_to_scrape != 0 and count == num_posts_to_scrape :
690692 break
691693 self .save_posts_data_json (posts_data )
694+ self .generate_main_md_file ()
692695 self .generate_main_html_file ()
693696
def generate_main_md_file(self) -> None:
    """
    Generates a Markdown index file for the scraped publication.

    Reads the posts index JSON (located via ``posts_json_path_template``
    inside ``format_vars["output_directory"]``), sorts the posts by ``id``
    in descending order, loads the per-post JSON of the newest post to
    obtain the publication metadata (its ``"pub"`` entry), and writes a
    Markdown file (located via ``posts_md_path_template``) containing:

    - the publication name as a level-1 heading,
    - an author line linking to the author's Substack profile,
    - the author bio,
    - a "Posts" section with one bullet per post: date, an HTML link to
      the post file, and like/comment/repost counters when non-zero.

    If the posts index is empty, nothing is written, because the
    publication metadata is only available through a post's JSON file.

    Raises:
        OSError: if one of the JSON files cannot be read or the Markdown
            file cannot be written.
        KeyError: if an expected key is missing from the JSON data.
    """
    # Local import so this method does not depend on the module's import block.
    import html

    output_directory = self.format_vars["output_directory"]

    # Read the index of all scraped posts.
    posts_json_path = os.path.join(
        output_directory,
        self.posts_json_path_template.substitute(self.format_vars)
    )
    with open(posts_json_path, 'r', encoding='utf-8') as file:
        posts_data = json.load(file)

    # Without posts there is no post JSON to take the publication from.
    if not posts_data:
        return

    # Sort by post id, descending.
    posts_data.sort(key=lambda p: p["id"], reverse=True)

    # The "post_json" path is joined onto the directory of the posts index.
    last_post_json_path = os.path.join(
        os.path.dirname(posts_json_path),
        posts_data[0]["post_json"]
    )
    with open(last_post_json_path, 'r', encoding='utf-8') as file:
        last_post = json.load(file)

    publication = last_post["pub"]

    md_output_path = os.path.join(
        output_directory,
        self.posts_md_path_template.substitute(self.format_vars)
    )

    # Link by author id rather than by "@handle": the original code marked
    # the handle-based URL as variable and the id-based URL as constant.
    author_url = f'https://substack.com/profile/{publication["author_id"]}'

    # Double each newline so every bio line becomes its own Markdown paragraph.
    # A missing (None) bio is treated as empty.
    author_bio = (publication["author_bio"] or "").replace("\n", "\n\n")

    lines = [
        f'# {publication["name"]}\n',
        '\n',
        f'by [{publication["author_name"]}]({author_url})\n',
        '\n',
        f'{author_bio}\n',
        '\n',
        '\n',
        '\n',
        '## Posts\n',
        '\n',
    ]

    for post in posts_data:
        # TODO use args.datetime_format
        post_date = post["date"]
        # The subtitle goes into an attribute value, so quotes must be
        # escaped too; the title is element text, so only &, < and >.
        subtitle_attr = html.escape(post["subtitle"] or "", quote=True)
        title_text = html.escape(post["title"], quote=False)
        post_link = (
            f'<a id="post{post["id"]}" href="{post["file_link"]}"'
            f' title="{subtitle_attr}">{title_text}</a>'
        )
        if post["like_count"] > 0:
            post_link += " ❤" + str(post["like_count"])  # " ❤123"
        if post["comment_count"] > 0:
            post_link += " 🗨" + str(post["comment_count"])  # " 🗨123"
        if post["repost_count"] > 0:
            post_link += " ↻" + str(post["repost_count"])  # " ↻123"
        lines.append(f'- {post_date} - {post_link}\n')

    # Build everything first, then write once, so a bad post entry cannot
    # leave a half-written index behind.
    with open(md_output_path, 'w', encoding='utf-8') as file:
        file.writelines(lines)
694765 def generate_main_html_file (self ) -> None :
695766 """
696767 Generates a HTML file for the given author.
@@ -706,6 +777,11 @@ def generate_main_html_file(self) -> None:
706777 # Convert JSON data to a JSON string for embedding
707778 embedded_json_data = json .dumps (posts_data , ** json_dump_kwargs )
708779
780+ md_output_path = os .path .join (
781+ self .format_vars ["output_directory" ],
782+ self .posts_md_path_template .substitute (self .format_vars )
783+ )
784+
709785 html_output_path = os .path .join (
710786 self .format_vars ["output_directory" ],
711787 self .posts_html_path_template .substitute (self .format_vars )
@@ -1069,6 +1145,12 @@ def parse_args() -> argparse.Namespace:
10691145 default = DEFAULT_IMAGE_PATH_FORMAT ,
10701146 help = f"The file path format to save scraped image files. Default: { DEFAULT_IMAGE_PATH_FORMAT !r} " ,
10711147 )
1148+ parser .add_argument (
1149+ "--posts-md-path-format" , # args.posts_md_path_format
1150+ type = str ,
1151+ default = DEFAULT_POSTS_MD_PATH_FORMAT ,
1152+ help = f"The file path format to save an index of scraped posts as Markdown file. Default: { DEFAULT_POSTS_MD_PATH_FORMAT !r} " ,
1153+ )
10721154 parser .add_argument (
10731155 "--posts-html-path-format" , # args.posts_html_path_format
10741156 type = str ,
0 commit comments