# modal-examples/misc/news_summarizer.py
# # News article summarizer
#
# In this example we scrape news articles from the [New York Times'
# Science section](https://www.nytimes.com/section/science) and summarize them
# using Google's deep learning summarization model [Pegasus](https://ai.googleblog.com/2020/06/pegasus-state-of-art-model-for.html).
# We log the resulting summaries to the terminal, but you can do whatever you want with the
# summaries afterwards: saving to a CSV file, sending to Slack, etc.
import os
import re
from dataclasses import dataclass
from typing import List
import modal
# ## Building Images and Downloading Pre-trained Model
#
# We start by defining our images. In Modal, each function can use a different
# image. This is powerful because you add only the dependencies you need for
# each function.
stub = modal.Stub("example-news-summarizer")
MODEL_NAME = "google/pegasus-xsum"
CACHE_DIR = "/cache"
# The first image contains the dependencies for running our model. We also cache the
# pre-trained model in a persisted shared volume so that we don't have to download it
# on every function call.
stub["deep_learning_image"] = modal.Image.debian_slim().pip_install("transformers==4.16.2", "torch", "sentencepiece")
# Defining the scraping image is very similar. This image only contains the packages required
# to scrape the New York Times website, so it's much smaller.
stub["scraping_image"] = modal.Image.debian_slim().pip_install("requests", "beautifulsoup4", "lxml")
volume = modal.SharedVolume().persist("pegasus-modal-vol")
# We will also instantiate the model and tokenizer globally so they're available for all functions that use this image.
if stub.is_inside(stub["deep_learning_image"]):
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
TOKENIZER = PegasusTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
MODEL = PegasusForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
if stub.is_inside(stub["scraping_image"]):
import requests
from bs4 import BeautifulSoup
# ## Collect Data
#
# Collecting data happens in two stages: first we fetch a list of article URLs
# using the NYT API, then we scrape the NYT web page for each of those articles
# to collect the article text.
@dataclass
class NYArticle:
    title: str
    image_url: str = ""
    url: str = ""
    summary: str = ""
    text: str = ""
# In order to connect to the NYT API, you will need to sign up at the [NYT Developer Portal](https://developer.nytimes.com/),
# create an app, and grab an API key. Then head to Modal and create a [Secret](https://modal.com/docs/guide/secrets) called `nytimes`
# with an environment variable called `NYTIMES_API_KEY` set to your API key.
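# As an optional sanity check before wiring up the secret, you can hit the same Top Stories
# endpoint used below directly from your shell (this assumes you have exported your key
# locally as `NYTIMES_API_KEY`):
#
# ```shell
# curl "https://api.nytimes.com/svc/topstories/v2/science.json?api-key=$NYTIMES_API_KEY"
# ```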
@stub.function(secret=modal.Secret.from_name("nytimes"), image=stub["scraping_image"])
def latest_science_stories(n_stories: int = 5) -> List[NYArticle]:
    # query the API for the latest science articles
    params = {
        "api-key": os.environ["NYTIMES_API_KEY"],
    }
    nyt_api_url = "https://api.nytimes.com/svc/topstories/v2/science.json"
    response = requests.get(nyt_api_url, params=params)

    # extract data from the articles and return a list of NYArticle objects
    results = response.json()
    reject_urls = {"null", "", None}
    articles = [
        NYArticle(
            title=u["title"],
            image_url=u.get("multimedia")[0]["url"] if u.get("multimedia") else "",
            url=u.get("url"),
        )
        for u in results["results"]
        if u.get("url") not in reject_urls
    ]

    # select only a handful of articles; the API usually returns around 25
    articles = articles[:n_stories]
    print(f"Retrieved {len(articles)} articles from the NYT Top Stories API")
    return articles
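# For reference, the parsing above only relies on a few fields of each entry in the
# response's `results` array; a trimmed, purely illustrative entry looks roughly like this
# (the values here are placeholders, not real API output):
#
# ```python
# {
#     "title": "Some headline",
#     "url": "https://...",
#     "multimedia": [{"url": "https://..."}],
# }
# ```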
# The NYT API gives us article URLs, but it doesn't include the article text. So we scrape
# each URL for the article body using
# [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).
@stub.function(image=stub["scraping_image"])
def scrape_nyc_article(url: str) -> str:
print(f"Scraping article => {url}")
# fetch article; simulate desktop browser
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9"
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")
# get all text paragraphs & construct single string with article text
article_text = ""
article_section = soup.find_all("div", {"class": re.compile(r"\bStoryBodyCompanionColumn\b")})
if article_section:
paragraph_tags = article_section[0].find_all("p")
article_text = " ".join([p.get_text() for p in paragraph_tags])
# return article with scraped text
return article_text
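# To see what the class-regex matching above does in isolation, here is a minimal sketch you
# can run in a local Python shell with `beautifulsoup4` and `lxml` installed (the HTML
# snippet is made up purely for illustration):
#
# ```python
# import re
# from bs4 import BeautifulSoup
#
# html = '<div class="css-1 StoryBodyCompanionColumn"><p>First.</p><p>Second.</p></div>'
# soup = BeautifulSoup(html, "lxml")
# section = soup.find_all("div", {"class": re.compile(r"\bStoryBodyCompanionColumn\b")})
# print(" ".join(p.get_text() for p in section[0].find_all("p")))  # => "First. Second."
# ```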
# Now the summarization function. We use `huggingface`'s Pegasus tokenizer and model implementation to
# generate a summary of each article. You can learn more about what Pegasus does in the [HuggingFace
# documentation](https://huggingface.co/docs/transformers/model_doc/pegasus). Use `gpu="any"` to speed up inference.
@stub.function(
    image=stub["deep_learning_image"],
    gpu=False,
    shared_volumes={CACHE_DIR: volume},
    memory=4096,
)
def summarize_article(text: str) -> str:
print(f"Summarizing text with {len(text)} characters.")
# summarize text
batch = TOKENIZER([text], truncation=True, padding="longest", return_tensors="pt").to("cpu")
translated = MODEL.generate(**batch)
summary = TOKENIZER.batch_decode(translated, skip_special_tokens=True)[0]
return summary
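# If you do want to run inference on a GPU, a rough sketch (not tested here) is to pass
# `gpu="any"` in the decorator above and move both the model and the input tensors to the
# GPU device, along these lines:
#
# ```python
# @stub.function(image=stub["deep_learning_image"], gpu="any", shared_volumes={CACHE_DIR: volume}, memory=4096)
# def summarize_article(text: str) -> str:
#     model = MODEL.to("cuda")
#     batch = TOKENIZER([text], truncation=True, padding="longest", return_tensors="pt").to("cuda")
#     translated = model.generate(**batch)
#     return TOKENIZER.batch_decode(translated, skip_special_tokens=True)[0]
# ```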
# ## Create a Scheduled Function
#
# Put everything together and schedule it to run every day. You can also use `modal.Cron` for a
# more advanced scheduling interface.
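# For example, `schedule=modal.Cron("0 9 * * *")` (standard five-field cron syntax) would
# run the function every day at 9:00 UTC instead of once per 24-hour period.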
@stub.function(schedule=modal.Period(days=1))
def trigger():
    articles = latest_science_stories.call()

    # parallelize article scraping
    for i, text in enumerate(scrape_nyc_article.map([a.url for a in articles])):
        articles[i].text = text

    # parallelize summarization; filter out empty articles first so the
    # enumerate index lines up with the mapped results
    articles_with_text = [a for a in articles if len(a.text) > 0]
    for i, summary in enumerate(summarize_article.map([a.text for a in articles_with_text])):
        articles_with_text[i].summary = summary

    # show all summaries in the terminal
    for article in articles:
        print(f'Summary of "{article.title}" => {article.summary}')
# Create a new Modal scheduled function with:
#
# ```shell
# modal deploy --name news_summarizer news_summarizer.py
# ```
# You can also run this entire Modal app in debugging mode before deploying it
# by calling it with regular Python: `python news_summarizer.py`.
if __name__ == "__main__":
    with stub.run():
        trigger.call()
# And that's it. You will now generate deep learning summaries from the latest
# NYT Science articles every day.