# # News article summarizer
#
# In this example we scrape news articles from the [New York Times'
# Science section](https://www.nytimes.com/section/science) and summarize them
# using Google's deep learning summarization model [Pegasus](https://ai.googleblog.com/2020/06/pegasus-state-of-art-model-for.html).
# We log the resulting summaries to the terminal, but you can do whatever you want with the
# summaries afterwards: saving to a CSV file, sending to Slack, etc.

import os
import re
from dataclasses import dataclass
from typing import List

import modal

app = modal.App(name="example-news-summarizer")

# ## Building Images and Downloading Pre-trained Model
#
# We start by defining our images. In Modal, each function can use a different
# image. This is powerful because you add only the dependencies you need for
# each function.

# The first image contains dependencies for running our model. We also download the
# pre-trained model into the image using the `from_pretrained` method.
# This caches the model so that we don't have to download it on every function call.
# The model will be saved at `/cache` when this function is called at image build time;
# subsequent calls of this function at runtime will then load the model from `/cache`.


def fetch_model(local_files_only: bool = False):
    from transformers import PegasusForConditionalGeneration, PegasusTokenizer

    tokenizer = PegasusTokenizer.from_pretrained(
        "google/pegasus-xsum",
        cache_dir="/cache",
        local_files_only=local_files_only,
    )
    model = PegasusForConditionalGeneration.from_pretrained(
        "google/pegasus-xsum",
        cache_dir="/cache",
        local_files_only=local_files_only,
    )
    return model, tokenizer


deep_learning_image = (
    modal.Image.debian_slim()
    .pip_install("transformers==4.16.2", "torch", "sentencepiece")
    .run_function(fetch_model)
)

# Defining the scraping image is very similar. This image only contains the packages
# required to scrape the New York Times website, though, so it's much smaller.
scraping_image = modal.Image.debian_slim().pip_install(
    "requests", "beautifulsoup4", "lxml"
)


with scraping_image.imports():
    import requests
    from bs4 import BeautifulSoup


# ## Collect Data
#
# Collecting data happens in two stages: first we fetch a list of article URLs
# from the NYT API, then we scrape the NYT web page for each of those articles
# to collect the article texts.


@dataclass
class NYArticle:
    title: str
    image_url: str = ""
    url: str = ""
    summary: str = ""
    text: str = ""


# In order to connect to the NYT API, you will need to sign up at the [NYT Developer Portal](https://developer.nytimes.com/),
# create an App, and grab an API key. Then head to Modal and create a [Secret](https://modal.com/docs/guide/secrets) called `nytimes`
# with an environment variable called `NYTIMES_API_KEY` set to your API key
# (the CLI equivalent is `modal secret create nytimes NYTIMES_API_KEY=<your key>`).
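#
# If you just want to experiment without creating a named Secret, Modal can also
# build a Secret from a plain dictionary. A minimal sketch (assuming your key is
# exported in your local shell environment; a named Secret is the better choice
# for deployed apps):
#
# ```python
# nytimes_secret = modal.Secret.from_dict(
#     {"NYTIMES_API_KEY": os.environ["NYTIMES_API_KEY"]}
# )
# ```
#
# You would then pass `secrets=[nytimes_secret]` to `app.function` below.

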
@app.function(
    secrets=[modal.Secret.from_name("nytimes")],
    image=scraping_image,
)
def latest_science_stories(n_stories: int = 5) -> List[NYArticle]:
    # query the API for the latest science articles
    params = {
        "api-key": os.environ["NYTIMES_API_KEY"],
    }
    nyt_api_url = "https://api.nytimes.com/svc/topstories/v2/science.json"
    response = requests.get(nyt_api_url, params=params)

    # extract data from the articles and return a list of NYArticle objects
    results = response.json()
    reject_urls = {"null", "", None}
    articles = [
        NYArticle(
            title=u["title"],
            image_url=(u.get("multimedia")[0]["url"] if u.get("multimedia") else ""),
            url=u.get("url"),
        )
        for u in results["results"]
        if u.get("url") not in reject_urls
    ]

    # select only a handful of articles; the API usually returns 25
    articles = articles[:n_stories]
    print(f"Retrieved {len(articles)} articles from the NYT Top Stories API")
    return articles


# The NYT API only gives us article URLs; it doesn't include the article text. We'll get the article URLs
# from the API and then scrape each URL for the article body. We'll be using
# [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) for that.


@app.function(image=scraping_image)
def scrape_nyc_article(url: str) -> str:
    print(f"Scraping article => {url}")

    # fetch the article, simulating a desktop browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")

    # get all text paragraphs & construct a single string with the article text
    article_text = ""
    article_section = soup.find_all(
        "div", {"class": re.compile(r"\bStoryBodyCompanionColumn\b")}
    )
    if article_section:
        paragraph_tags = article_section[0].find_all("p")
        article_text = " ".join([p.get_text() for p in paragraph_tags])

    # return the scraped article text
    return article_text


# Now for the summarization function. We use Hugging Face's Pegasus tokenizer and model
# implementation to generate a summary of each article. You can learn more about what
# Pegasus does in the [HuggingFace
# documentation](https://huggingface.co/docs/transformers/model_doc/pegasus). Pass `gpu="any"` to speed up inference.


@app.function(
    image=deep_learning_image,
    gpu=None,  # set this to "any" to run inference on a GPU
    memory=4096,
)
def summarize_article(text: str) -> str:
    print(f"Summarizing text with {len(text)} characters.")

    # `local_files_only` is set to `True` because we expect to read the model
    # files saved in the image.
    model, tokenizer = fetch_model(local_files_only=True)

    # summarize text
    batch = tokenizer(
        [text], truncation=True, padding="longest", return_tensors="pt"
    ).to("cpu")
    translated = model.generate(**batch)
    summary = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]

    return summary


# ## Create a Scheduled Function
#
# Put everything together and schedule it to run every day. You can also use `modal.Cron` for a
# more advanced scheduling interface, as sketched below.
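#
# For example, a `modal.Cron` schedule that runs the pipeline at 9am UTC every day
# might look like this (a sketch; `modal.Cron` takes a standard five-field cron string):
#
# ```python
# @app.function(schedule=modal.Cron("0 9 * * *"))
# def trigger():
#     ...
# ```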


@app.function(schedule=modal.Period(days=1))
def trigger():
    articles = latest_science_stories.remote()

    # parallelize article scraping
    for i, text in enumerate(scrape_nyc_article.map([a.url for a in articles])):
        articles[i].text = text

    # parallelize summarization, skipping articles whose scrape returned no text;
    # indexing the filtered list keeps summaries aligned with their articles
    articles_with_text = [a for a in articles if len(a.text) > 0]
    for i, summary in enumerate(
        summarize_article.map([a.text for a in articles_with_text])
    ):
        articles_with_text[i].summary = summary

    # show all summaries in the terminal
    for article in articles:
        print(f'Summary of "{article.title}" => {article.summary}')


# Create a new Modal scheduled function with:
#
# ```shell
# modal deploy --name news_summarizer news_summarizer.py
# ```

# You can also run this entire Modal app in debugging mode before deploying it;
# call it with `modal run news_summarizer.py`.


@app.local_entrypoint()
def main():
    trigger.remote()


# And that's it. You will now generate deep learning summaries from the latest
# NYT Science articles every day.