Skip to content

Commit 9da11a4

Browse files
Merge #1363
1363: Make recent downloads fast r=ashleygwilliams Every aspect of crates.io is extremely fast except one. Calculating recent downloads is a moderately expensive operation that we are doing too often. It's not "slow" per-se. The query takes 500ms to complete on average, which is not unreasonable. However, this query does put a good bit of load on the DB server. The root cause of #1304 was an irresponsible bot hitting /crates (which includes recent downloads) enough that it caused our DB server's CPU load to reach 100%. We're on a very low tier database server right now, so we could fix this by just upgrading to the next tier if we wanted to. However, as the number of crates grows, this problem will just get worse. Recent downloads appear in 3 places. Summary, search, and crate details. Summary and search are the two endpoints that crawlers want to hit, so we can't have slow queries on those pages. There are other solutions we could take here. We could just not include recent downloads on those pages, and include them only if a flag is set. We could also move recent downloads to another endpoint which gets hit separately. Both of these require an annoying amount of code though, and we need a crawler policy that says "don't hit these endpoints", which requires work for us to monitor those endpoints, etc. Ultimately we don't need this data to be real time (it already is slightly delayed anyway). We can just store this particular piece of data in a materialized view, which we refresh periodically. Right now I'm refreshing it whenever update-downloads runs, but I think we can go even further and update it daily if this becomes a scaling problem in the future. Even though repopulating the view currently only takes ~500ms (same amount of time as the query), I've opted to do it concurrently to avoid locking the table for long periods of time as we continue to scale. Crate details is still using the `crate_downloads` table, so it's a bit closer to real time. When we're selecting this for a single crate, the query is already fast enough. We may want to change this in the future. We could also consider getting rid of `crate_downloads` entirely, and populate the view from `version_downloads`. This will cause more CPU load when we refresh the view, but it probably doesn't matter.
2 parents 715f6a2 + c6101b5 commit 9da11a4

File tree

7 files changed

+63
-46
lines changed

7 files changed

+63
-46
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
DROP FUNCTION refresh_recent_crate_downloads();
2+
DROP INDEX recent_crate_downloads_crate_id;
3+
DROP MATERIALIZED VIEW recent_crate_downloads;
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
CREATE MATERIALIZED VIEW recent_crate_downloads (crate_id, downloads) AS
2+
SELECT crate_id, SUM(downloads) FROM crate_downloads
3+
WHERE date > date(CURRENT_TIMESTAMP - INTERVAL '90 days')
4+
GROUP BY crate_id;
5+
CREATE UNIQUE INDEX recent_crate_downloads_crate_id ON recent_crate_downloads (crate_id);
6+
7+
CREATE FUNCTION refresh_recent_crate_downloads() RETURNS VOID AS $$
8+
REFRESH MATERIALIZED VIEW CONCURRENTLY recent_crate_downloads;
9+
$$ LANGUAGE SQL;

src/bin/update-downloads.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
extern crate cargo_registry;
44
extern crate chrono;
5+
#[macro_use]
56
extern crate diesel;
67

78
use diesel::prelude::*;
@@ -29,6 +30,7 @@ fn main() {
2930
}
3031

3132
fn update(conn: &PgConnection) -> QueryResult<()> {
33+
use diesel::select;
3234
use version_downloads::dsl::*;
3335

3436
let mut max = Some(0);
@@ -42,6 +44,9 @@ fn update(conn: &PgConnection) -> QueryResult<()> {
4244
collect(conn, &rows)?;
4345
max = rows.last().map(|d| d.id);
4446
}
47+
48+
no_arg_sql_function!(refresh_recent_crate_downloads, ());
49+
select(refresh_recent_crate_downloads).execute(conn)?;
4550
Ok(())
4651
}
4752

src/controllers/krate/metadata.rs

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ use models::krate::ALL_COLUMNS;
1414

1515
/// Handles the `GET /summary` route.
1616
pub fn summary(req: &mut Request) -> CargoResult<Response> {
17-
use diesel::sql_query;
1817
use schema::crates::dsl::*;
1918

2019
let conn = req.db_conn()?;
@@ -54,23 +53,12 @@ pub fn summary(req: &mut Request) -> CargoResult<Response> {
5453
.limit(10)
5554
.load(&*conn)?;
5655

57-
// This query needs to be structured in this way to have the LIMIT
58-
// happen before the joining/sorting for performance reasons.
59-
// It needs to use sql_query because Diesel doesn't have a great way
60-
// to join on subselects right now :(
61-
let most_recently_downloaded = sql_query(
62-
"SELECT crates.* \
63-
FROM crates \
64-
JOIN ( \
65-
SELECT crate_downloads.crate_id, SUM(crate_downloads.downloads) \
66-
FROM crate_downloads \
67-
WHERE crate_downloads.date > date(CURRENT_TIMESTAMP - INTERVAL '90 days') \
68-
GROUP BY crate_downloads.crate_id \
69-
ORDER BY SUM(crate_downloads.downloads) DESC NULLS LAST \
70-
LIMIT 10 \
71-
) cd ON crates.id = cd.crate_id \
72-
ORDER BY cd.sum DESC NULLS LAST",
73-
).load::<Crate>(&*conn)?;
56+
let most_recently_downloaded = crates
57+
.inner_join(recent_crate_downloads::table)
58+
.order(recent_crate_downloads::downloads.desc())
59+
.select(ALL_COLUMNS)
60+
.limit(10)
61+
.load(&*conn)?;
7462

7563
let popular_keywords = keywords::table
7664
.order(keywords::crates_cnt.desc())

src/controllers/krate/search.rs

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,7 @@ use models::krate::{canon_crate_name, ALL_COLUMNS};
3232
/// function out to cover the different use cases, and create unit tests
3333
/// for them.
3434
pub fn search(req: &mut Request) -> CargoResult<Response> {
35-
use diesel::dsl::*;
36-
use diesel::sql_types::{BigInt, Bool, Nullable};
35+
use diesel::sql_types::Bool;
3736

3837
let conn = req.db_conn()?;
3938
let (offset, limit) = req.pagination(10, 100)?;
@@ -43,30 +42,15 @@ pub fn search(req: &mut Request) -> CargoResult<Response> {
4342
.map(|s| &**s)
4443
.unwrap_or("recent-downloads");
4544

46-
let recent_downloads = sql::<Nullable<BigInt>>("SUM(crate_downloads.downloads)");
47-
4845
let mut query = crates::table
49-
.left_join(
50-
crate_downloads::table.on(crates::id
51-
.eq(crate_downloads::crate_id)
52-
.and(crate_downloads::date.gt(date(now - 90.days())))),
53-
)
54-
.group_by(crates::id)
46+
.left_join(recent_crate_downloads::table)
5547
.select((
5648
ALL_COLUMNS,
5749
false.into_sql::<Bool>(),
58-
recent_downloads.clone(),
50+
recent_crate_downloads::downloads.nullable(),
5951
))
6052
.into_boxed();
6153

62-
if sort == "downloads" {
63-
query = query.order(crates::downloads.desc())
64-
} else if sort == "recent-downloads" {
65-
query = query.order(recent_downloads.clone().desc().nulls_last())
66-
} else {
67-
query = query.order(crates::name.asc())
68-
}
69-
7054
if let Some(q_string) = params.get("q") {
7155
let sort = params.get("sort").map(|s| &**s).unwrap_or("relevance");
7256
let q = plainto_tsquery(q_string);
@@ -78,16 +62,13 @@ pub fn search(req: &mut Request) -> CargoResult<Response> {
7862
query = query.select((
7963
ALL_COLUMNS,
8064
Crate::with_name(q_string),
81-
recent_downloads.clone(),
65+
recent_crate_downloads::downloads.nullable(),
8266
));
83-
let perfect_match = Crate::with_name(q_string).desc();
84-
if sort == "downloads" {
85-
query = query.order((perfect_match, crates::downloads.desc()));
86-
} else if sort == "recent-downloads" {
87-
query = query.order((perfect_match, recent_downloads.clone().desc().nulls_last()));
88-
} else {
67+
query = query.order(Crate::with_name(q_string).desc());
68+
69+
if sort == "relevance" {
8970
let rank = ts_rank_cd(crates::textsearchable_index_col, q);
90-
query = query.order((perfect_match, rank.desc()))
71+
query = query.then_order_by(rank.desc())
9172
}
9273
}
9374

@@ -156,6 +137,14 @@ pub fn search(req: &mut Request) -> CargoResult<Response> {
156137
);
157138
}
158139

140+
if sort == "downloads" {
141+
query = query.then_order_by(crates::downloads.desc())
142+
} else if sort == "recent-downloads" {
143+
query = query.then_order_by(recent_crate_downloads::downloads.desc().nulls_last())
144+
} else {
145+
query = query.then_order_by(crates::name.asc())
146+
}
147+
159148
// The database query returns a tuple within a tuple , with the root
160149
// tuple containing 3 items.
161150
let data = query

src/schema.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,24 @@ table! {
591591
}
592592
}
593593

594+
table! {
595+
/// Representation of the `recent_crate_downloads` view.
596+
///
597+
/// This data represents the downloads in the last 90 days.
598+
/// This view does not contain realtime data.
599+
/// It is refreshed by the `update-downloads` script.
600+
recent_crate_downloads (crate_id) {
601+
/// The `crate_id` column of the `recent_crate_downloads` view.
602+
///
603+
/// Its SQL type is `Integer`.
604+
crate_id -> Integer,
605+
/// The `downloads` column of the `recent_crate_downloads` table.
606+
///
607+
/// Its SQL type is `BigInt`.
608+
downloads -> BigInt,
609+
}
610+
}
611+
594612
table! {
595613
use diesel::sql_types::*;
596614
use diesel_full_text_search::{TsVector as Tsvector};
@@ -865,6 +883,7 @@ joinable!(emails -> users (user_id));
865883
joinable!(follows -> crates (crate_id));
866884
joinable!(follows -> users (user_id));
867885
joinable!(readme_renderings -> versions (version_id));
886+
joinable!(recent_crate_downloads -> crates (crate_id));
868887
joinable!(version_authors -> users (user_id));
869888
joinable!(version_authors -> versions (version_id));
870889
joinable!(version_downloads -> versions (version_id));
@@ -886,6 +905,7 @@ allow_tables_to_appear_in_same_query!(
886905
keywords,
887906
metadata,
888907
readme_renderings,
908+
recent_crate_downloads,
889909
reserved_crate_names,
890910
teams,
891911
users,

src/tests/all.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ impl<'a> CrateBuilder<'a> {
408408
}
409409

410410
fn build(mut self, connection: &PgConnection) -> CargoResult<Crate> {
411-
use diesel::{insert_into, update};
411+
use diesel::{insert_into, select, update};
412412

413413
let mut krate = self.krate
414414
.create_or_update(connection, None, self.owner_id)?;
@@ -444,6 +444,9 @@ impl<'a> CrateBuilder<'a> {
444444
insert_into(crate_downloads::table)
445445
.values(&crate_download)
446446
.execute(connection)?;
447+
448+
no_arg_sql_function!(refresh_recent_crate_downloads, ());
449+
select(refresh_recent_crate_downloads).execute(connection)?;
447450
}
448451

449452
if self.versions.is_empty() {

0 commit comments

Comments
 (0)