From 7087a19ca7dbbcba7dfc33b232bcd8a368dc4b16 Mon Sep 17 00:00:00 2001 From: Jorrit Schaap <schaap@astron.nl> Date: Thu, 7 Dec 2017 12:11:28 +0000 Subject: [PATCH] Task #8721: many optimizations. added tests. added quota. --- .gitattributes | 2 +- LTA/ltastorageoverview/bin/CMakeLists.txt | 2 +- LTA/ltastorageoverview/lib/CMakeLists.txt | 7 +- .../lib/create_db_ltastorageoverview.sql | 623 ------------ .../ltaso/create_db_ltastorageoverview.sql | 962 ++++++++++++++++++ LTA/ltastorageoverview/lib/scraper.py | 287 ++++-- LTA/ltastorageoverview/lib/store.py | 252 ++--- .../lib/webservice/templates/index.html | 88 +- .../lib/webservice/webservice.py | 98 +- .../test/db_performance_test.py | 49 +- LTA/ltastorageoverview/test/test_store.py | 409 +++++--- 11 files changed, 1760 insertions(+), 1019 deletions(-) delete mode 100644 LTA/ltastorageoverview/lib/create_db_ltastorageoverview.sql create mode 100644 LTA/ltastorageoverview/lib/ltaso/create_db_ltastorageoverview.sql diff --git a/.gitattributes b/.gitattributes index 14e9faffde7..8da125420af 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3001,7 +3001,7 @@ LTA/ltastorageoverview/bin/ltastorageoverviewwebservice -text LTA/ltastorageoverview/bin/ltastorageoverviewwebservice.ini -text LTA/ltastorageoverview/lib/CMakeLists.txt -text LTA/ltastorageoverview/lib/__init__.py -text -LTA/ltastorageoverview/lib/create_db_ltastorageoverview.sql -text +LTA/ltastorageoverview/lib/ltaso/create_db_ltastorageoverview.sql -text LTA/ltastorageoverview/lib/report.py -text LTA/ltastorageoverview/lib/scraper.py -text LTA/ltastorageoverview/lib/store.py -text diff --git a/LTA/ltastorageoverview/bin/CMakeLists.txt b/LTA/ltastorageoverview/bin/CMakeLists.txt index 444362ad514..09002fec02f 100644 --- a/LTA/ltastorageoverview/bin/CMakeLists.txt +++ b/LTA/ltastorageoverview/bin/CMakeLists.txt @@ -1,6 +1,6 @@ # $Id$ -lofar_add_bin_scripts(ltastorageoverviewscraper ltastorageoverviewwebservice) +lofar_add_bin_scripts(ltastorageoverviewscraper ltastorageoverviewwebservice copymcu005) # supervisord config files install(FILES diff --git a/LTA/ltastorageoverview/lib/CMakeLists.txt b/LTA/ltastorageoverview/lib/CMakeLists.txt index be9f8a021fa..d9a9a63929c 100644 --- a/LTA/ltastorageoverview/lib/CMakeLists.txt +++ b/LTA/ltastorageoverview/lib/CMakeLists.txt @@ -9,15 +9,14 @@ python_install( webservice/__init__.py DESTINATION lofar/lta/ltastorageoverview) +set(sql_files ltaso/create_db_ltastorageoverview.sql) +lofar_add_data_files(${sql_files}) + set(web_files webservice/templates/index.html) install(FILES ${web_files} DESTINATION ${PYTHON_INSTALL_DIR}/lofar/lta/ltastorageoverview/webservice/templates/) -set(sql_files create_db_ltastorageoverview.sql) - -install_files(/share/ltaso/sql FILES ${sql_files}) - foreach(web_file ${web_files}) get_filename_component(web_file_path ${web_file} PATH) get_filename_component(web_file_abs ${web_file} ABSOLUTE) diff --git a/LTA/ltastorageoverview/lib/create_db_ltastorageoverview.sql b/LTA/ltastorageoverview/lib/create_db_ltastorageoverview.sql deleted file mode 100644 index f039ad4a903..00000000000 --- a/LTA/ltastorageoverview/lib/create_db_ltastorageoverview.sql +++ /dev/null @@ -1,623 +0,0 @@ -/* -# Copyright (C) 2012-2015 asTRON (Netherlands Institute for Radio Astronomy) -# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands -# -# This file is part of the LOFAR software suite. 
-# The LOFAR software suite is free software: you can redistribute it and/or -# modify it under the terms of the GNU General Public License as published -# by the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# The LOFAR software suite is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>. -*/ - --- $Id$ - --- postgresql create script for ltastorageoverview database - --- run from command line as: --- psql ltaso -f create_db_ltastorageoverview.sql -W - --- \set VERBOSITY terse - -BEGIN; - -DROP SCHEMA IF EXISTS lta CASCADE; -DROP SCHEMA IF EXISTS scraper CASCADE; -DROP SCHEMA IF EXISTS metainfo CASCADE; - -CREATE SCHEMA lta; -CREATE SCHEMA scraper; -CREATE SCHEMA metainfo; - --- TABLES - -CREATE TABLE lta.storage_site ( - id serial, - name text UNIQUE NOT NULL, - url text UNIQUE NOT NULL, - PRIMARY KEY (id) -) WITH (OIDS=FALSE); - -CREATE INDEX ss_name_idx on lta.storage_site(name); - -CREATE TABLE lta.directory ( - id serial, - name text NOT NULL, - parent_directory_id integer REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - PRIMARY KEY (id), - UNIQUE (name, parent_directory_id) -) WITH (OIDS=FALSE); - - -CREATE TABLE lta.directory_closure ( - ancestor_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - descendant_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - depth integer NOT NULL, - primary key (ancestor_id, descendant_id) -) WITH (OIDS=FALSE); - -CREATE INDEX dc_ancestor_id_idx on lta.directory_closure(ancestor_id); -CREATE INDEX dc_descendant_id_idx on lta.directory_closure(descendant_id); -CREATE INDEX dc_depth_idx on lta.directory_closure(depth); - -CREATE TABLE lta.storage_site_root_dir ( - storage_site_id integer NOT NULL REFERENCES lta.storage_site ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - directory_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - primary key (storage_site_id, directory_id) -) WITH (OIDS=FALSE); - -CREATE INDEX ssr_storage_site_id_idx on lta.storage_site_root_dir(storage_site_id); -CREATE INDEX ssr_directory_id_idx on lta.storage_site_root_dir(directory_id); - -CREATE TABLE lta.fileinfo ( - id serial, - name text NOT NULL, - size bigint NOT NULL, - creation_date timestamp NOT NULL, - directory_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - PRIMARY KEY (id), - UNIQUE (name, directory_id) -) WITH (OIDS=FALSE); - -CREATE INDEX fi_directory_id_idx on lta.fileinfo(directory_id); -CREATE INDEX fi_creation_date_idx on lta.fileinfo(creation_date); - -CREATE TABLE lta._changed_fileinfo_cache ( - fileinfo_id integer NOT NULL REFERENCES lta.fileinfo ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - directory_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - PRIMARY KEY (fileinfo_id) -) WITH (OIDS=FALSE); - -CREATE TABLE scraper.last_directory_visit ( - id serial, - directory_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - visit_date timestamp NOT NULL, - PRIMARY KEY 
(id) -) WITH (OIDS=FALSE); - -CREATE TABLE metainfo.tree_stats ( - id serial, - tree_root_directory_id integer UNIQUE NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - num_files integer, - total_file_size bigint, - min_file_size bigint, - max_file_size bigint, - min_file_creation_date timestamp, - max_file_creation_date timestamp, - PRIMARY KEY (id) -) WITH (OIDS=FALSE); - -CREATE INDEX ts_tree_root_directory_id_idx on metainfo.tree_stats(tree_root_directory_id); -CREATE INDEX ts_min_file_creation_date_idx on metainfo.tree_stats(min_file_creation_date); -CREATE INDEX ts_max_file_creation_date_idx on metainfo.tree_stats(max_file_creation_date); - -CREATE TABLE metainfo.directory_stats ( - id serial, - directory_id integer UNIQUE NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - num_files integer, - total_file_size bigint, - min_file_size bigint, - max_file_size bigint, - min_file_creation_date timestamp, - max_file_creation_date timestamp, - PRIMARY KEY (id) -) WITH (OIDS=FALSE); - -CREATE INDEX ds_directory_id_idx on metainfo.directory_stats(directory_id); -CREATE INDEX ds_min_file_creation_date_idx on metainfo.directory_stats(min_file_creation_date); -CREATE INDEX ds_max_file_creation_date_idx on metainfo.directory_stats(max_file_creation_date); - -CREATE TABLE metainfo.project ( - id serial, - name text UNIQUE NOT NULL, - PRIMARY KEY (id) -) WITH (OIDS=FALSE); - -CREATE INDEX project_name_idx on metainfo.project(name); - -CREATE TABLE metainfo.project_top_level_directory ( - project_id integer NOT NULL REFERENCES metainfo.project ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - directory_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - primary key (project_id, directory_id) -) WITH (OIDS=FALSE); - -CREATE TABLE metainfo.observation ( - id int, - PRIMARY KEY (id) -) WITH (OIDS=FALSE); - -CREATE TABLE metainfo.project_observation ( - id serial, - project_id integer NOT NULL REFERENCES metainfo.project ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - observation_id integer NOT NULL REFERENCES metainfo.observation ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - PRIMARY KEY (id) -) WITH (OIDS=FALSE); - -CREATE TABLE metainfo.dataproducts ( - id serial, - fileinfo_id integer NOT NULL REFERENCES lta.fileinfo ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - observation_id integer NOT NULL REFERENCES metainfo.observation ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, - name text NOT NULL, - PRIMARY KEY (id) -) WITH (OIDS=FALSE); - -CREATE INDEX dp_dataproduct_name_idx on metainfo.dataproducts(name); - --- END TABLES - - --- TRIGGERS - -CREATE OR REPLACE FUNCTION lta.on_directory_inserted_add_directory_closure_entry() - RETURNS trigger AS -$BODY$ -BEGIN - INSERT INTO lta.directory_closure (ancestor_id, descendant_id, depth) values (NEW.id, NEW.id, 0) ; - - INSERT INTO lta.directory_closure (ancestor_id, descendant_id, depth) - SELECT p.ancestor_id, c.descendant_id, p.depth+c.depth+1 - FROM lta.directory_closure p, lta.directory_closure c - WHERE p.descendant_id=new.parent_directory_id AND c.ancestor_id=new.id ; - - RETURN NEW; -END; -$BODY$ - LANGUAGE plpgsql VOLATILE - COST 100; - -CREATE TRIGGER trigger_on_directory_inserted_add_directory_closure_entry - AFTER INSERT - ON lta.directory - FOR EACH ROW - EXECUTE PROCEDURE lta.on_directory_inserted_add_directory_closure_entry(); - --------------------------------------------------------------------------------- - 
-CREATE OR REPLACE FUNCTION lta.on_fileinfo_inserting_lock_changed_fileinfo_cache() - RETURNS trigger AS -$BODY$ -DECLARE -BEGIN - LOCK TABLE lta._changed_fileinfo_cache IN EXCLUSIVE MODE; - RETURN NEW; -END; -$BODY$ - LANGUAGE plpgsql VOLATILE - COST 100; - -CREATE TRIGGER trigger_on_fileinfo_inserting_lock_changed_fileinfo_cache - BEFORE INSERT - ON lta.fileinfo - FOR EACH STATEMENT - EXECUTE PROCEDURE lta.on_fileinfo_inserting_lock_changed_fileinfo_cache(); - --------------------------------------------------------------------------------- - -CREATE OR REPLACE FUNCTION lta.on_fileinfo_inserted_add_to_cache() - RETURNS trigger AS -$BODY$ -DECLARE -BEGIN - INSERT INTO lta._changed_fileinfo_cache (fileinfo_id,directory_id) - VALUES (NEW.id, NEW.directory_id); - - RETURN NEW; -END; -$BODY$ - LANGUAGE plpgsql VOLATILE - COST 100; - -CREATE TRIGGER trigger_on_fileinfo_inserted_add_to_cache - AFTER INSERT - ON lta.fileinfo - FOR EACH ROW - EXECUTE PROCEDURE lta.on_fileinfo_inserted_add_to_cache(); - --------------------------------------------------------------------------------- - -CREATE OR REPLACE FUNCTION lta.on_fileinfo_inserted_add_directory_stats() - RETURNS trigger AS -$BODY$ -DECLARE -touched_dir_rec record; -dc_rec record; -BEGIN - FOR touched_dir_rec IN (SELECT DISTINCT ON (directory_id) directory_id FROM lta._changed_fileinfo_cache) LOOP - --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING - INSERT INTO metainfo.directory_stats (directory_id) - (SELECT touched_dir_rec.directory_id WHERE NOT EXISTS (SELECT directory_id FROM metainfo.directory_stats WHERE directory_id = touched_dir_rec.directory_id)); - - DROP TABLE IF EXISTS temp_fileinfo_for_dirstats; - - CREATE TEMPORARY TABLE temp_fileinfo_for_dirstats - AS (SELECT fi.size, fi.creation_date - FROM lta.fileinfo fi - WHERE fi.directory_id = touched_dir_rec.directory_id) ; - - UPDATE metainfo.directory_stats SET - num_files=(SELECT count(size) FROM temp_fileinfo_for_dirstats), - total_file_size=(SELECT sum(size) FROM temp_fileinfo_for_dirstats), - min_file_size=(SELECT min(size) FROM temp_fileinfo_for_dirstats), - max_file_size=(SELECT max(size) FROM temp_fileinfo_for_dirstats), - min_file_creation_date=(SELECT min(creation_date) FROM temp_fileinfo_for_dirstats), - max_file_creation_date=(SELECT max(creation_date) FROM temp_fileinfo_for_dirstats) - WHERE directory_id = touched_dir_rec.directory_id ; - - FOR dc_rec IN (SELECT dc.ancestor_id as dir_id FROM lta.directory_closure dc WHERE dc.descendant_id = touched_dir_rec.directory_id) LOOP - --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING - INSERT INTO metainfo.tree_stats (tree_root_directory_id) - (SELECT dc_rec.dir_id WHERE NOT EXISTS (SELECT tree_root_directory_id FROM metainfo.tree_stats WHERE tree_root_directory_id = dc_rec.dir_id)); - - DROP TABLE IF EXISTS temp_fileinfo_for_treestats; - - CREATE TEMPORARY TABLE temp_fileinfo_for_treestats - AS (SELECT fi.size, fi.creation_date - FROM lta.directory_closure dc - INNER JOIN lta.fileinfo fi ON fi.directory_id = dc.descendant_id - WHERE dc.ancestor_id = dc_rec.dir_id) ; - - UPDATE metainfo.tree_stats SET - num_files=(SELECT count(size) FROM temp_fileinfo_for_treestats), - total_file_size=(SELECT sum(size) FROM temp_fileinfo_for_treestats), - min_file_size=(SELECT min(size) FROM temp_fileinfo_for_treestats), - max_file_size=(SELECT max(size) FROM temp_fileinfo_for_treestats), - min_file_creation_date=(SELECT min(creation_date) FROM temp_fileinfo_for_treestats), - max_file_creation_date=(SELECT 
max(creation_date) FROM temp_fileinfo_for_treestats) - WHERE tree_root_directory_id = dc_rec.dir_id ; - END LOOP; - END LOOP; - - TRUNCATE lta._changed_fileinfo_cache; - - RETURN NEW; -END; -$BODY$ - LANGUAGE plpgsql VOLATILE - COST 100; - -CREATE TRIGGER trigger_on_fileinfo_inserted_add_directory_stats - AFTER INSERT - ON lta.fileinfo - FOR EACH STATEMENT - EXECUTE PROCEDURE lta.on_fileinfo_inserted_add_directory_stats(); - --------------------------------------------------------------------------------- - -CREATE OR REPLACE FUNCTION lta.on_directory_inserted_parse_project_info() - RETURNS trigger AS -$BODY$ -DECLARE -project_pos int; -next_slash_pos int; -new_dir_name text; -dir_name_tail text; -new_project_name text; -new_project_id int; -project_dir_name text; -project_dir_id int; -obs_id int; -obs_dir_name text; -BEGIN - new_dir_name := trim(trailing '/' from NEW.name); - project_pos := strpos(new_dir_name, '/projects'); - IF project_pos > 0 THEN - dir_name_tail := substring(new_dir_name from project_pos + 10); - IF length(dir_name_tail) > 0 THEN - next_slash_pos := strpos(dir_name_tail, '/'); - IF next_slash_pos > 0 THEN - new_project_name := substring(dir_name_tail from 0 for next_slash_pos); - ELSE - new_project_name := dir_name_tail; - END IF; - - IF length(new_project_name) > 0 THEN - --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING - INSERT INTO metainfo.project(name) - (SELECT new_project_name WHERE NOT EXISTS (SELECT name FROM metainfo.project WHERE name = new_project_name)); - - SELECT id FROM metainfo.project WHERE name = new_project_name LIMIT 1 INTO new_project_id; - - IF new_project_id IS NOT NULL THEN - IF next_slash_pos > 0 THEN - project_dir_name := substring(new_dir_name from 0 for project_pos + 10 + next_slash_pos - 1); - ELSE - project_dir_name := new_dir_name; - END IF; - - IF project_dir_name = new_dir_name THEN - --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING - INSERT INTO metainfo.project_top_level_directory(project_id, directory_id) - (SELECT new_project_id, NEW.id WHERE NOT EXISTS (SELECT ptld.project_id, ptld.directory_id FROM metainfo.project_top_level_directory ptld WHERE ptld.project_id = new_project_id AND ptld.directory_id = NEW.id)); - ELSE - dir_name_tail := substring(dir_name_tail from length(new_project_name)+2); - next_slash_pos := strpos(dir_name_tail, '/'); - IF next_slash_pos > 0 THEN - obs_dir_name := substring(dir_name_tail from 0 for next_slash_pos); - ELSE - obs_dir_name := dir_name_tail; - END IF; - BEGIN - obs_id := obs_dir_name::integer; - - --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING - INSERT INTO metainfo.observation(id) - (SELECT obs_id WHERE NOT EXISTS (SELECT id FROM metainfo.observation WHERE id = obs_id)); - - --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING - INSERT INTO metainfo.project_observation(project_id, observation_id) - (SELECT new_project_id, obs_id WHERE NOT EXISTS (SELECT project_id, observation_id FROM metainfo.project_observation WHERE project_id = new_project_id AND observation_id = obs_id)); - EXCEPTION WHEN invalid_text_representation THEN - END; - END IF; - END IF; - END IF; - END IF; - END IF; - - RETURN NEW; -END; -$BODY$ - LANGUAGE plpgsql VOLATILE - COST 100; - -CREATE TRIGGER trigger_on_directory_inserted_parse_project_info - AFTER INSERT - ON lta.directory - FOR EACH ROW - EXECUTE PROCEDURE lta.on_directory_inserted_parse_project_info(); - --------------------------------------------------------------------------------- - -CREATE OR REPLACE FUNCTION 
lta.on_fileinfo_inserted_parse_observation_info() - RETURNS trigger AS -$BODY$ -DECLARE -new_file_name text; -L_pos int; -first_underscore_pos int; -last_underscore_pos int; -obs_id int; -dataproduct_name text; -BEGIN - new_file_name := trim(leading '/' from NEW.name); - L_pos := strpos(new_file_name, 'L'); - first_underscore_pos := strpos(new_file_name, '_'); - IF L_pos > 0 AND first_underscore_pos > L_pos THEN - BEGIN - obs_id := substring(new_file_name from L_pos+1 for first_underscore_pos-2)::integer; - - --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING - INSERT INTO metainfo.observation(id) - (SELECT obs_id WHERE NOT EXISTS (SELECT id FROM metainfo.observation WHERE id = obs_id)); - - last_underscore_pos := length(new_file_name) - strpos(reverse(new_file_name), '_'); - IF last_underscore_pos > L_pos THEN - dataproduct_name := substring(new_file_name from L_pos for last_underscore_pos); - - --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING - INSERT INTO metainfo.dataproducts(fileinfo_id, observation_id, name) - (SELECT NEW.id, obs_id, dataproduct_name WHERE NOT EXISTS (SELECT fileinfo_id, observation_id, name FROM metainfo.dataproducts WHERE fileinfo_id = NEW.id AND observation_id = obs_id AND name = dataproduct_name)); - END IF; - - EXCEPTION WHEN invalid_text_representation THEN - END; - END IF; - RETURN NEW; -END; -$BODY$ - LANGUAGE plpgsql VOLATILE - COST 100; - -CREATE TRIGGER trigger_on_fileinfo_inserted_parse_observation_info - AFTER INSERT - ON lta.fileinfo - FOR EACH ROW - EXECUTE PROCEDURE lta.on_fileinfo_inserted_parse_observation_info(); - --------------------------------------------------------------------------------- - -CREATE OR REPLACE FUNCTION lta.on_directory_inserted_add_scraperlast_directory_visit() - RETURNS trigger AS -$BODY$ -DECLARE -BEGIN - --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING - INSERT INTO scraper.last_directory_visit(directory_id, visit_date) - (SELECT NEW.id, '1970-01-01' WHERE NOT EXISTS (SELECT id FROM scraper.last_directory_visit WHERE directory_id = NEW.id)); - - RETURN NEW; -END; -$BODY$ - LANGUAGE plpgsql VOLATILE - COST 100; - -CREATE TRIGGER trigger_on_directory_inserted_add_scraperlast_directory_visit - AFTER INSERT - ON lta.directory - FOR EACH ROW - EXECUTE PROCEDURE lta.on_directory_inserted_add_scraperlast_directory_visit(); - --------------------------------------------------------------------------------- - -CREATE OR REPLACE FUNCTION lta.on_site_deleted_delete_site_directories() - RETURNS trigger AS -$BODY$ -BEGIN - DELETE FROM lta.directory WHERE id in (SELECT dir_id FROM lta.site_directory_tree WHERE site_id = OLD.ID); - - RETURN OLD; -END; -$BODY$ - LANGUAGE plpgsql VOLATILE - COST 100; - -CREATE TRIGGER trigger_on_site_deleted_delete_site_directories - BEFORE DELETE - ON lta.storage_site - FOR EACH ROW - EXECUTE PROCEDURE lta.on_site_deleted_delete_site_directories(); - --------------------------------------------------------------------------------- - --- END TRIGGERS - - --- VIEWS - -CREATE VIEW lta.root_directories as - select dir.id as dir_id, dir.name as dir_name, ss.id as site_id, ss.name as site_name - from lta.storage_site_root_dir ssrd - join lta.directory dir on dir.id = ssrd.directory_id - join lta.storage_site ss on ss.id = ssrd.storage_site_id ; - -CREATE VIEW lta.site_directory_tree as - select rootdir.site_id as site_id, - rootdir.site_name as site_name, - rootdir.dir_id as rootdir_id, - rootdir.dir_name as rootdir_name, - dir.id as dir_id, - dir.name as dir_name, - 
dir.parent_directory_id as parent_directory_id, - dc.depth as depth - from lta.root_directories rootdir - inner join lta.directory_closure dc on dc.ancestor_id = rootdir.dir_id - inner join lta.directory dir on dc.descendant_id = dir.id - order by rootdir_id, depth; - -CREATE VIEW scraper.site_scraper_last_directoy_visit as - select rootdir.site_id as site_id, - rootdir.site_name as site_name, - dir.id as dir_id, - dir.name as dir_name, - sldv.visit_date as last_visit - from lta.root_directories rootdir - inner join lta.directory_closure dc on dc.ancestor_id = rootdir.dir_id - inner join lta.directory dir on dc.descendant_id = dir.id - inner join scraper.last_directory_visit sldv on sldv.directory_id = dir.id ; - -CREATE VIEW lta.site_directory_file as - select site.id as site_id, - site.name as site_name, - dir.id as dir_id, - dir.name as dir_name, - fileinfo.id as file_id, - fileinfo.name as file_name, - fileinfo.size as file_size, - fileinfo.creation_date as file_creation_date - from lta.storage_site site - join lta.storage_site_root_dir ssrd on ssrd .storage_site_id = site.id - inner join lta.directory_closure dc on dc.ancestor_id = ssrd .directory_id - inner join lta.directory dir on dc.descendant_id = dir.id - inner join lta.fileinfo on fileinfo.directory_id = dir.id ; - -CREATE VIEW metainfo.project_directory as - select - project.id as project_id, - project.name as project_name, - dir.id as dir_id, - dir.name as dir_name - from metainfo.project_top_level_directory ptld - inner join metainfo.project on project.id = ptld.project_id - inner join lta.directory_closure dc on dc.ancestor_id = ptld.directory_id - inner join lta.directory dir on dc.descendant_id = dir.id ; - -CREATE VIEW metainfo.project_directory_stats as - select * from metainfo.project_directory pd - inner join metainfo.directory_stats ds on ds.directory_id = pd.dir_id ; - -CREATE VIEW metainfo.site_directory_stats as - select sdt.site_id, - sdt.site_name, - sdt.dir_id, - sdt.dir_name, - ds.num_files as dir_num_files, - ds.total_file_size as dir_total_file_size, - ds.min_file_size as dir_min_file_size, - ds.max_file_size as dir_max_file_size, - ds.min_file_creation_date as dir_min_file_creation_date, - ds.max_file_creation_date as dir_max_file_creation_date, - ts.num_files as tree_num_files, - ts.total_file_size as tree_total_file_size, - ts.min_file_size as tree_min_file_size, - ts.max_file_size as tree_max_file_size, - ts.min_file_creation_date as tree_min_file_creation_date, - ts.max_file_creation_date as tree_max_file_creation_date - from lta.site_directory_tree sdt - left join metainfo.directory_stats ds on ds.directory_id = sdt.dir_id - left join metainfo.tree_stats ts on ts.tree_root_directory_id = sdt.dir_id - order by rootdir_id, depth; - - -CREATE VIEW metainfo.project_observation_dataproducts as - SELECT p.id AS project_id, - p.name AS project_name, - dp.observation_id, - dp.id AS dataproduct_id, - dp.name AS dataproduct_name, - dp.fileinfo_id AS fileinfo_id - FROM metainfo.dataproducts dp - LEFT JOIN metainfo.project_observation po ON po.observation_id = dp.observation_id - LEFT JOIN metainfo.project p ON p.id = po.project_id; - -CREATE VIEW metainfo.dataproducts_all as - SELECT pod.*, sdf.* - FROM metainfo.project_observation_dataproducts pod - INNER JOIN lta.site_directory_file sdf on sdf.file_id = pod.fileinfo_id; - - -CREATE VIEW metainfo.project_stats as - select ptld.project_id, p.name as project_name, site_id, site_name, dir_id, dir_name, tree_num_files, tree_total_file_size, 
tree_min_file_creation_date, tree_max_file_creation_date
-    from metainfo.project_top_level_directory ptld
-    inner join metainfo.project p on p.id = ptld.project_id
-    inner join metainfo.site_directory_stats sds on sds.dir_id = ptld.directory_id
-    where tree_num_files NOTNULL
-    order by name, site_name;
-
--- END VIEWS
-
-COMMIT;
-
--- insert into lta.storage_site (name, url) values ('sara', 'sara.nl');
--- insert into lta.directory (name, parent_directory_id) values ('/pnfs/grid.sara.nl/data/lofar/ops', null);
--- insert into lta.directory (name, parent_directory_id) values ('/pnfs/grid.sara.nl/data/lofar/ops/lc1_001', 1);
--- insert into lta.directory (name, parent_directory_id) values ('/pnfs/grid.sara.nl/data/lofar/ops/lc1_001/L123456', 2);
--- insert into lta.storage_site_root_dir (storage_site_id, directory_id) values(1, 1);
---
--- insert into lta.fileinfo (name, size, creation_date, directory_id) values ('foo', 1, '2000-01-01', 3);
--- insert into lta.fileinfo (name, size, creation_date, directory_id) values ('bar', 1, '2000-01-01', 3);
--- insert into lta.fileinfo (name, size, creation_date, directory_id) values ('baz', 10, '2001-01-01', 3);
---
--- select * from lta.site_directory_tree;
--- select * from lta.site_directory_file;
--- select * from lta.site_directory_stats;
---
--- insert into lta.fileinfo (name, size, creation_date, directory_id) values ('xyz', 1, '2000-01-01', 2);
--- insert into lta.fileinfo (name, size, creation_date, directory_id) values ('qwe', 5, '2000-01-01', 1);
--- select * from lta.site_directory_stats;
diff --git a/LTA/ltastorageoverview/lib/ltaso/create_db_ltastorageoverview.sql b/LTA/ltastorageoverview/lib/ltaso/create_db_ltastorageoverview.sql
new file mode 100644
index 00000000000..6fbf60f1cde
--- /dev/null
+++ b/LTA/ltastorageoverview/lib/ltaso/create_db_ltastorageoverview.sql
@@ -0,0 +1,962 @@
+/*
+# Copyright (C) 2012-2015 ASTRON (Netherlands Institute for Radio Astronomy)
+# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
+#
+# This file is part of the LOFAR software suite.
+# The LOFAR software suite is free software: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# The LOFAR software suite is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>.
+*/ + +-- $Id$ + +-- postgresql create script for ltastorageoverview database + +-- run from command line as: +-- psql ltaso -f create_db_ltastorageoverview.sql -W + +-- \set VERBOSITY terse + +BEGIN; + +DROP SCHEMA IF EXISTS lta CASCADE; +DROP SCHEMA IF EXISTS scraper CASCADE; +DROP SCHEMA IF EXISTS metainfo CASCADE; + +CREATE SCHEMA lta; +CREATE SCHEMA scraper; +CREATE SCHEMA metainfo; + +-- TABLES + +CREATE TABLE lta.site ( + id serial, + name text UNIQUE NOT NULL, + url text UNIQUE NOT NULL, + PRIMARY KEY (id) +) WITH (OIDS=FALSE); + +CREATE INDEX ss_name_idx on lta.site(name); + +CREATE TABLE lta.directory ( + id serial, + name text NOT NULL, + parent_dir_id integer REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + PRIMARY KEY (id), + UNIQUE (name, parent_dir_id) +) WITH (OIDS=FALSE); + +CREATE INDEX d_parent_dir_id_idx on lta.directory(parent_dir_id); +CREATE INDEX d_name_idx on lta.directory(name); + +CREATE TABLE lta.directory_closure ( + ancestor_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + descendant_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + depth integer NOT NULL, + primary key (ancestor_id, descendant_id) +) WITH (OIDS=FALSE); + +CREATE INDEX dc_ancestor_id_idx on lta.directory_closure(ancestor_id); +CREATE INDEX dc_descendant_id_idx on lta.directory_closure(descendant_id); +CREATE INDEX dc_depth_idx on lta.directory_closure(depth); + +CREATE TABLE lta.fileinfo ( + id serial, + name text NOT NULL, + size bigint NOT NULL, + creation_date timestamp without time zone NOT NULL, + dir_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + PRIMARY KEY (id), + UNIQUE (name, dir_id) +) WITH (OIDS=FALSE); + +CREATE INDEX fi_dir_id_idx on lta.fileinfo(dir_id); +CREATE INDEX fi_creation_date_idx on lta.fileinfo(creation_date); +CREATE INDEX fi_name_idx on lta.fileinfo(name); + +CREATE TABLE lta.site_root_dir ( + site_id integer NOT NULL REFERENCES lta.site ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, + root_dir_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + primary key (site_id, root_dir_id) +) WITH (OIDS=FALSE); + +CREATE INDEX ssr_site_id_idx on lta.site_root_dir(site_id); +CREATE INDEX ssr_root_dir_id_idx on lta.site_root_dir(root_dir_id); + +CREATE TABLE lta.site_quota ( + id serial, + site_id integer NOT NULL REFERENCES lta.site ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, + quota bigint NOT NULL, + valid_until_date timestamp without time zone NOT NULL, + primary key (id) +) WITH (OIDS=FALSE); + +CREATE TABLE lta.quota_root_dirs ( + site_id integer NOT NULL REFERENCES lta.site ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, + root_dir_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + primary key (site_id, root_dir_id) +); + +CREATE TABLE lta._directory_update_cache ( + dir_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + PRIMARY KEY (dir_id) +) WITH (OIDS=FALSE); + +CREATE TABLE scraper.last_directory_visit ( + id serial, + dir_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + visit_date timestamp without time zone NOT NULL DEFAULT '1970-01-01', + PRIMARY KEY (id) +) WITH (OIDS=FALSE); + +CREATE INDEX ldv_dir_id_idx on scraper.last_directory_visit(dir_id); +CREATE INDEX ldv_visit_date_idx on 
scraper.last_directory_visit(visit_date); + +CREATE TABLE metainfo.stats ( + id serial, + dir_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + dir_num_files integer DEFAULT 0 NOT NULL, + dir_total_file_size bigint DEFAULT 0 NOT NULL, + dir_min_file_size bigint DEFAULT 0 NOT NULL, + dir_max_file_size bigint DEFAULT 0 NOT NULL, + dir_min_file_creation_date timestamp without time zone DEFAULT NULL, + dir_max_file_creation_date timestamp without time zone DEFAULT NULL, + tree_num_files integer DEFAULT 0 NOT NULL, + tree_total_file_size bigint DEFAULT 0 NOT NULL, + tree_min_file_size bigint DEFAULT NULL, + tree_max_file_size bigint DEFAULT NULL, + tree_min_file_creation_date timestamp without time zone DEFAULT NULL, + tree_max_file_creation_date timestamp without time zone DEFAULT NULL, + PRIMARY KEY (id) +); + +CREATE INDEX stats_dir_id_idx on metainfo.stats(dir_id); +CREATE INDEX stats_dir_min_file_creation_date_idx on metainfo.stats(dir_min_file_creation_date); +CREATE INDEX stats_dir_max_file_creation_date_idx on metainfo.stats(dir_max_file_creation_date); +CREATE INDEX stats_tree_min_file_creation_date_idx on metainfo.stats(tree_min_file_creation_date); +CREATE INDEX stats_tree_max_file_creation_date_idx on metainfo.stats(tree_max_file_creation_date); + +CREATE TABLE metainfo.project ( + id serial, + name text UNIQUE NOT NULL, + PRIMARY KEY (id) +) WITH (OIDS=FALSE); + +CREATE INDEX project_name_idx on metainfo.project(name); + +CREATE TABLE metainfo.project_top_level_directory ( + project_id integer NOT NULL REFERENCES metainfo.project ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, + dir_id integer NOT NULL REFERENCES lta.directory ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + primary key (project_id, dir_id) +) WITH (OIDS=FALSE); + +CREATE INDEX ptld_project_id_idx on metainfo.project_top_level_directory(project_id); +CREATE INDEX ptld_dir_id_idx on metainfo.project_top_level_directory(dir_id); + +CREATE TABLE metainfo.observation ( + id int, -- sas id, like 'L123456', but then as integer, so 123456 + PRIMARY KEY (id) +) WITH (OIDS=FALSE); + +CREATE TABLE metainfo.project_observation ( + project_id integer NOT NULL REFERENCES metainfo.project ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, + observation_id integer NOT NULL REFERENCES metainfo.observation ON DELETE CASCADE DEFERRABLE INITIALLY IMMEDIATE, + PRIMARY KEY (project_id, observation_id) +) WITH (OIDS=FALSE); + +CREATE TABLE metainfo.dataproduct ( + id serial, + fileinfo_id integer NOT NULL REFERENCES lta.fileinfo ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + observation_id integer NOT NULL REFERENCES metainfo.observation ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + name text NOT NULL, + PRIMARY KEY (id) +) WITH (OIDS=FALSE); + +CREATE INDEX dp_dataproduct_name_idx on metainfo.dataproduct(name); +CREATE INDEX dp_fileinfo_id_idx on metainfo.dataproduct(fileinfo_id); + +-- END TABLES + + +-- TRIGGERS + +CREATE OR REPLACE FUNCTION lta.on_site_root_dir_deleted_do_delete_directory() +RETURNS trigger AS +$BODY$ +BEGIN + DELETE FROM lta.directory WHERE id = OLD.root_dir_id; + RETURN OLD; +END; +$BODY$ +LANGUAGE plpgsql VOLATILE +COST 100; + +CREATE TRIGGER trigger_on_site_root_dir_deleted_do_delete_directory +AFTER DELETE +ON lta.site_root_dir +FOR EACH ROW +EXECUTE PROCEDURE lta.on_site_root_dir_deleted_do_delete_directory(); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION 
lta.on_directory_inserted_add_directory_closure_entry() +RETURNS trigger AS +$BODY$ +BEGIN + INSERT INTO lta.directory_closure (ancestor_id, descendant_id, depth) values (NEW.id, NEW.id, 0) ; + + INSERT INTO lta.directory_closure (ancestor_id, descendant_id, depth) + SELECT p.ancestor_id, c.descendant_id, p.depth+c.depth+1 + FROM lta.directory_closure p, lta.directory_closure c + WHERE p.descendant_id=new.parent_dir_id AND c.ancestor_id=new.id ; + + RETURN NEW; +END; +$BODY$ +LANGUAGE plpgsql VOLATILE +COST 100; + +CREATE TRIGGER trigger_on_directory_inserted_add_directory_closure_entry +AFTER INSERT +ON lta.directory +FOR EACH ROW +EXECUTE PROCEDURE lta.on_directory_inserted_add_directory_closure_entry(); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION scraper.on_directory_inserted_add_last_directory_visit_entry() +RETURNS trigger AS +$BODY$ +BEGIN + --RAISE NOTICE 'on_directory_inserted_add_last_directory_visit_entry, NEW=%', NEW; + --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING + INSERT INTO scraper.last_directory_visit(dir_id) + (SELECT NEW.id WHERE NOT EXISTS (SELECT dir_id FROM scraper.last_directory_visit WHERE dir_id = NEW.id)); + + RETURN NEW; +END; +$BODY$ +LANGUAGE plpgsql VOLATILE +COST 100; + +CREATE TRIGGER trigger_on_directory_inserted_add_last_directory_visit_entry +AFTER INSERT +ON lta.directory +FOR EACH ROW +EXECUTE PROCEDURE scraper.on_directory_inserted_add_last_directory_visit_entry(); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION scraper.on_site_root_dir_inserted_do_add_to_quota_root_dirs() +RETURNS trigger AS +$BODY$ +BEGIN + -- by default, add each root directory as 'directory under quota' + -- users can remove them by hand + INSERT INTO lta.quota_root_dirs(site_id, root_dir_id) + VALUES (NEW.site_id, NEW.root_dir_id); + + RETURN NEW; +END; +$BODY$ +LANGUAGE plpgsql VOLATILE +COST 100; + +CREATE TRIGGER trigger_on_site_root_dir_inserted_do_add_to_quota_root_dirs +AFTER INSERT +ON lta.site_root_dir +FOR EACH ROW +EXECUTE PROCEDURE scraper.on_site_root_dir_inserted_do_add_to_quota_root_dirs(); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION metainfo.on_directory_inserted_add_stats_entry() +RETURNS trigger AS +$BODY$ +BEGIN + --RAISE NOTICE 'on_directory_inserted_add_stats_entry, NEW=%', NEW; + INSERT INTO metainfo.stats(dir_id) values (NEW.id); + + -- always trim trailing slashes from dirname + NEW.name := trim(trailing '/' from NEW.name); + + RETURN NEW; +END; +$BODY$ +LANGUAGE plpgsql VOLATILE +COST 100; + +CREATE TRIGGER trigger_on_directory_inserted_add_stats_entry +BEFORE INSERT +ON lta.directory +FOR EACH ROW +EXECUTE PROCEDURE metainfo.on_directory_inserted_add_stats_entry(); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION metainfo.on_fileinfo_insert_update_delete_store_in_cache() +RETURNS trigger AS +$BODY$ +BEGIN + --RAISE NOTICE 'on_fileinfo_insert_update_delete_store_in_cache, NEW=%', NEW; + + IF TG_OP = 'DELETE' THEN + INSERT INTO lta._directory_update_cache (dir_id) VALUES (OLD.dir_id); + RETURN OLD; + END IF; + + --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING + INSERT INTO lta._directory_update_cache (dir_id) + (SELECT NEW.dir_id WHERE NOT EXISTS (SELECT dir_id FROM lta._directory_update_cache WHERE dir_id = NEW.dir_id)); + + RETURN NEW; +END; +$BODY$ +LANGUAGE 
plpgsql VOLATILE
+COST 100;
+
+CREATE TRIGGER trigger_on_fileinfo_insert_update_delete_store_in_cache
+AFTER INSERT OR UPDATE OR DELETE
+ON lta.fileinfo
+FOR EACH ROW
+EXECUTE PROCEDURE metainfo.on_fileinfo_insert_update_delete_store_in_cache();
+
+--------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION metainfo.on_directory_update_cache_commit_do_update_dir_stats()
+RETURNS trigger AS
+$BODY$
+DECLARE
+    fileinfo_row lta.fileinfo%ROWTYPE;
+    _dir_id integer;
+    _dir_num_files bigint;
+    _dir_total_file_size bigint;
+    _dir_min_file_size bigint;
+    _dir_max_file_size bigint;
+    _dir_min_file_creation_date timestamp without time zone;
+    _dir_max_file_creation_date timestamp without time zone;
+BEGIN
+-- --RAISE NOTICE 'on_fileinfo_commit_do_update_dir_stats start';
+
+    FOR _dir_id in (SELECT DISTINCT(c.dir_id) FROM lta._directory_update_cache c) LOOP
+-- --RAISE NOTICE 'on_fileinfo_commit_do_update_dir_stats dir_id=% #fi_in_cache=%', dir_id, (SELECT COUNT(*) FROM lta._directory_update_cache);
+        _dir_num_files := 0;
+        _dir_total_file_size := 0;
+        _dir_min_file_size := NULL;
+        _dir_max_file_size := NULL;
+        -- also reset the creation date aggregates, so values from the
+        -- previous directory in this loop cannot leak into the next one
+        _dir_min_file_creation_date := NULL;
+        _dir_max_file_creation_date := NULL;
+
+        -- aggregate results
+        FOR fileinfo_row IN (SELECT * FROM lta.fileinfo fi where fi.dir_id = _dir_id) LOOP
+            --RAISE NOTICE 'on_fileinfo_commit_do_update_dir_stats dir_id=% fileinfo=%', dir_id, fileinfo_row.name;
+            _dir_num_files := _dir_num_files + 1;
+            _dir_total_file_size := _dir_total_file_size + fileinfo_row.size;
+            _dir_min_file_size := LEAST(_dir_min_file_size, fileinfo_row.size);
+            _dir_max_file_size := GREATEST(_dir_max_file_size, fileinfo_row.size);
+            _dir_min_file_creation_date := LEAST(_dir_min_file_creation_date, fileinfo_row.creation_date);
+            _dir_max_file_creation_date := GREATEST(_dir_max_file_creation_date, fileinfo_row.creation_date);
+        END LOOP;
+
+-- --RAISE NOTICE 'on_fileinfo_commit_do_update_dir_stats dir_id=%, dir_num_files=%, dir_total_file_size=%, dir_min_file_size=%, dir_max_file_size=%', dir_id, _dir_num_files, _dir_total_file_size, _dir_min_file_size, _dir_max_file_size;
+
+        UPDATE metainfo.stats
+        SET (dir_num_files, dir_total_file_size, dir_min_file_size, dir_max_file_size, dir_min_file_creation_date, dir_max_file_creation_date) =
+            (_dir_num_files, _dir_total_file_size, _dir_min_file_size, _dir_max_file_size, _dir_min_file_creation_date, _dir_max_file_creation_date)
+        WHERE dir_id = _dir_id;
+
+        DELETE FROM lta._directory_update_cache WHERE dir_id = _dir_id;
+    END LOOP;
+
+-- --RAISE NOTICE 'on_fileinfo_commit_do_update_dir_stats end';
+    RETURN NULL;
+END;
+$BODY$
+LANGUAGE plpgsql VOLATILE
+COST 100;
+
+CREATE CONSTRAINT TRIGGER trigger_on_directory_update_cache_commit_do_update_dir_stats
+AFTER INSERT
+ON lta._directory_update_cache
+DEFERRABLE INITIALLY DEFERRED
+FOR EACH ROW
+EXECUTE PROCEDURE metainfo.on_directory_update_cache_commit_do_update_dir_stats();
+
+--------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION metainfo.on_dir_stats_update_do_update_tree_stats()
+RETURNS trigger AS
+$BODY$
+DECLARE
+    stats_row metainfo.stats%ROWTYPE;
+BEGIN
+    --RAISE NOTICE 'on_dir_stats_update_do_update_tree_stats, NEW_before=%', NEW;
+
+    -- initialize the NEW.tree_* variables with this dir's dir_stats...
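+    -- (the directory's own files are the base case of the aggregate; the
+    -- already-computed tree_* totals of the direct subdirectories are folded
+    -- in by the loop below, so no full recursive scan of the tree is needed)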
+ NEW.tree_num_files := NEW.dir_num_files; + NEW.tree_total_file_size := NEW.dir_total_file_size; + NEW.tree_min_file_size := NEW.dir_min_file_size; + NEW.tree_max_file_size := NEW.dir_max_file_size; + NEW.tree_min_file_creation_date := NEW.dir_min_file_creation_date; + NEW.tree_max_file_creation_date := NEW.dir_max_file_creation_date; + + -- loop over the tree stats from all filled subdirs of this directory + -- and aggregate them to the new_tree_* variables + FOR stats_row IN SELECT st.* FROM metainfo.stats st + INNER JOIN lta.directory dir ON dir.id = st.dir_id + WHERE dir.parent_dir_id = NEW.dir_id + AND tree_max_file_creation_date IS NOT NULL + AND dir_max_file_creation_date IS NOT NULL LOOP + + --RAISE NOTICE 'on_dir_stats_update_do_update_tree_stats, subtree_stats_row=%', stats_row; + + -- aggregate + NEW.tree_num_files := NEW.tree_num_files + stats_row.tree_num_files; + NEW.tree_total_file_size := NEW.tree_total_file_size + stats_row.tree_total_file_size; + NEW.tree_min_file_size := LEAST(NEW.tree_min_file_size, stats_row.tree_min_file_size); + NEW.tree_max_file_size := GREATEST(NEW.tree_max_file_size, stats_row.tree_max_file_size); + NEW.tree_min_file_creation_date := LEAST(NEW.tree_min_file_creation_date, stats_row.tree_min_file_creation_date); + NEW.tree_max_file_creation_date := GREATEST(NEW.tree_max_file_creation_date, stats_row.tree_max_file_creation_date); + END LOOP; + + --RAISE NOTICE 'on_dir_stats_update_do_update_tree_stats, NEW_after =%', NEW; + + -- return the NEW row with the updated tree_* variables + RETURN NEW; +END; +$BODY$ +LANGUAGE plpgsql VOLATILE +COST 100; + +CREATE TRIGGER trigger_on_dir_stats_update_do_update_tree_stats +BEFORE UPDATE OF dir_num_files, dir_total_file_size, dir_min_file_size, dir_max_file_size, dir_min_file_creation_date, dir_max_file_creation_date +ON metainfo.stats +FOR EACH ROW +EXECUTE PROCEDURE metainfo.on_dir_stats_update_do_update_tree_stats(); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION metainfo.on_stats_update_do_update_parents_tree_stats() +RETURNS trigger AS +$BODY$ +DECLARE + stats_row metainfo.stats%ROWTYPE; + parent_stats_row metainfo.stats%ROWTYPE; + new_tree_num_files bigint; + new_tree_total_file_size bigint; + new_tree_min_file_size bigint; + new_tree_max_file_size bigint; + new_tree_min_file_creation_date timestamp without time zone; + new_tree_max_file_creation_date timestamp without time zone; +BEGIN + --RAISE NOTICE 'on_stats_update_do_update_parents_tree_stats begin, NEW=%', NEW; + + -- climb up the tree until at root, start with the first direct parent + SELECT st.* FROM metainfo.stats st + INNER JOIN lta.directory dir on dir.parent_dir_id = st.dir_id + WHERE dir.id = NEW.dir_id + LIMIT 1 + INTO parent_stats_row; + + --loop and climb further up the tree until at root + WHILE parent_stats_row.id IS NOT NULL LOOP + --RAISE NOTICE 'on_stats_update_do_update_parents_tree_stats, parent_stats_row=%', parent_stats_row; + + -- initialize all new_tree_* vars with the current parent_stats_row's values or 0/null. 
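+        -- (GREATEST(0, x) maps a NULL dir count/size to 0, since GREATEST
+        -- ignores NULL arguments; the min/max sizes and dates may legitimately
+        -- stay NULL while no files are known yet)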
+ new_tree_num_files := GREATEST(0, parent_stats_row.dir_num_files); + new_tree_total_file_size := GREATEST(0, parent_stats_row.dir_total_file_size); + new_tree_min_file_size := parent_stats_row.tree_min_file_size; + new_tree_max_file_size := parent_stats_row.tree_max_file_size; + new_tree_min_file_creation_date := parent_stats_row.tree_min_file_creation_date; + new_tree_max_file_creation_date := parent_stats_row.tree_max_file_creation_date; + + -- loop over the tree stats from all filled subdirs of the parent's directory + -- and aggregate them to the new_tree_* variables + FOR stats_row in SELECT st.* FROM metainfo.stats st + INNER JOIN lta.directory dir ON dir.id = st.dir_id + WHERE dir.parent_dir_id = parent_stats_row.dir_id LOOP + --RAISE NOTICE 'on_stats_update_do_update_parents_tree_stats, parent_subdir_stats_row=%', stats_row; + + -- aggregate + new_tree_num_files := new_tree_num_files + stats_row.tree_num_files; + new_tree_total_file_size := new_tree_total_file_size + stats_row.tree_total_file_size; + new_tree_min_file_size := LEAST(new_tree_min_file_size, stats_row.tree_min_file_size); + new_tree_max_file_size := GREATEST(new_tree_max_file_size, stats_row.tree_max_file_size); + new_tree_min_file_creation_date := LEAST(new_tree_min_file_creation_date, stats_row.tree_min_file_creation_date); + new_tree_max_file_creation_date := GREATEST(new_tree_max_file_creation_date, stats_row.tree_max_file_creation_date); + END LOOP; + + -- and update the parent stats row with the aggregated results + UPDATE metainfo.stats stats + SET (tree_num_files, tree_total_file_size, tree_min_file_size, tree_max_file_size, tree_min_file_creation_date, tree_max_file_creation_date) = + (new_tree_num_files, new_tree_total_file_size, new_tree_min_file_size, new_tree_max_file_size, new_tree_min_file_creation_date, new_tree_max_file_creation_date) + WHERE stats.dir_id = parent_stats_row.dir_id; + + -- climb the tree by selecting the parent's parent, and loop again. 
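+        -- (for example, after files land in /root/a/b the UPDATE above
+        -- refreshes /root/a, and this SELECT moves on to /root; at the top
+        -- of the tree it yields a NULL row, which ends the WHILE loop)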
+ SELECT st.* FROM metainfo.stats st + INNER JOIN lta.directory dir on dir.parent_dir_id = st.dir_id + WHERE dir.id = parent_stats_row.dir_id + LIMIT 1 + INTO parent_stats_row; + --RAISE NOTICE 'on_stats_update_do_update_parents_tree_stats, parents parent_stats_row=%', parent_stats_row; + END LOOP; + + RETURN NEW; +END; +$BODY$ +LANGUAGE plpgsql VOLATILE +COST 100; + +CREATE TRIGGER trigger_on_stats_update_do_update_parents_tree_stats +AFTER UPDATE OF dir_num_files, dir_total_file_size, dir_min_file_size, dir_max_file_size, dir_min_file_creation_date, dir_max_file_creation_date +ON metainfo.stats +FOR EACH ROW +EXECUTE PROCEDURE metainfo.on_stats_update_do_update_parents_tree_stats(); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION lta.on_directory_inserted_parse_project_info() +RETURNS trigger AS +$BODY$ +DECLARE + project_pos int; + next_slash_pos int; + new_dir_name text; + dir_name_tail text; + new_project_name text; + new_project_id int; + project_dir_name text; + project_dir_id int; + obs_id int; + obs_dir_name text; +BEGIN + new_dir_name := trim(trailing '/' from NEW.name); + project_pos := strpos(new_dir_name, '/projects'); + + IF project_pos > 0 THEN + dir_name_tail := substring(new_dir_name from project_pos + 10); + IF length(dir_name_tail) > 0 THEN + next_slash_pos := strpos(dir_name_tail, '/'); + IF next_slash_pos > 0 THEN + new_project_name := substring(dir_name_tail from 0 for next_slash_pos); + ELSE + new_project_name := dir_name_tail; + END IF; + + IF length(new_project_name) > 0 THEN + --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING + INSERT INTO metainfo.project(name) + (SELECT new_project_name WHERE NOT EXISTS (SELECT name FROM metainfo.project WHERE name = new_project_name)); + + SELECT id FROM metainfo.project WHERE name = new_project_name LIMIT 1 INTO new_project_id; + + IF new_project_id IS NOT NULL THEN + IF next_slash_pos > 0 THEN + project_dir_name := substring(new_dir_name from 0 for project_pos + 10 + next_slash_pos - 1); + ELSE + project_dir_name := new_dir_name; + END IF; + + IF project_dir_name = new_dir_name THEN + --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING + INSERT INTO metainfo.project_top_level_directory(project_id, dir_id) + (SELECT new_project_id, NEW.id WHERE NOT EXISTS (SELECT ptld.project_id, ptld.dir_id FROM metainfo.project_top_level_directory ptld WHERE ptld.project_id = new_project_id AND ptld.dir_id = NEW.id)); + ELSE + dir_name_tail := substring(dir_name_tail from length(new_project_name)+2); + next_slash_pos := strpos(dir_name_tail, '/'); + IF next_slash_pos > 0 THEN + obs_dir_name := substring(dir_name_tail from 0 for next_slash_pos); + ELSE + obs_dir_name := dir_name_tail; + END IF; + BEGIN + obs_id := obs_dir_name::integer; + + --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING + INSERT INTO metainfo.observation(id) + (SELECT obs_id WHERE NOT EXISTS (SELECT id FROM metainfo.observation WHERE id = obs_id)); + + --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING + INSERT INTO metainfo.project_observation(project_id, observation_id) + (SELECT new_project_id, obs_id WHERE NOT EXISTS (SELECT project_id, observation_id FROM metainfo.project_observation WHERE project_id = new_project_id AND observation_id = obs_id)); + EXCEPTION WHEN invalid_text_representation THEN + END; + END IF; + END IF; + END IF; + END IF; + END IF; + + RETURN NEW; +END; +$BODY$ +LANGUAGE plpgsql VOLATILE +COST 100; + +CREATE TRIGGER 
trigger_on_directory_inserted_parse_project_info +AFTER INSERT +ON lta.directory +FOR EACH ROW +EXECUTE PROCEDURE lta.on_directory_inserted_parse_project_info(); + +-------------------------------------------------------------------------------- + + CREATE OR REPLACE FUNCTION lta.on_fileinfo_inserted_parse_observation_info() + RETURNS trigger AS + $BODY$ + DECLARE + new_file_name text; + L_pos int; + first_underscore_pos int; + first_dot_pos int; + obs_id int; + dataproduct_name text; + BEGIN + new_file_name := trim(leading '/' from NEW.name); + L_pos := strpos(new_file_name, 'L'); + first_underscore_pos := strpos(new_file_name, '_'); + IF L_pos > 0 AND first_underscore_pos > L_pos THEN + BEGIN + obs_id := substring(new_file_name from L_pos+1 for first_underscore_pos-2)::integer; + + --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING + INSERT INTO metainfo.observation(id) + (SELECT obs_id WHERE NOT EXISTS (SELECT id FROM metainfo.observation WHERE id = obs_id)); + + first_dot_pos := strpos(new_file_name, '.'); + IF first_dot_pos > L_pos THEN + dataproduct_name := substring(new_file_name from L_pos for first_dot_pos-1); + + --postgres < 9.5 way of doing INSERT...ON CONFLICT DO NOTHING + INSERT INTO metainfo.dataproduct(fileinfo_id, observation_id, name) + (SELECT NEW.id, obs_id, dataproduct_name WHERE NOT EXISTS (SELECT fileinfo_id, observation_id, name FROM metainfo.dataproduct WHERE fileinfo_id = NEW.id AND observation_id = obs_id AND name = dataproduct_name)); + END IF; + + EXCEPTION WHEN invalid_text_representation THEN + END; + END IF; + RETURN NEW; + END; + $BODY$ + LANGUAGE plpgsql VOLATILE + COST 100; + +CREATE TRIGGER trigger_on_fileinfo_inserted_parse_observation_info + AFTER INSERT + ON lta.fileinfo + FOR EACH ROW + EXECUTE PROCEDURE lta.on_fileinfo_inserted_parse_observation_info(); + +-- END TRIGGERS + + +-- BEGIN NORMAL FUNCTIONS + +CREATE OR REPLACE FUNCTION metainfo.get_tree_stats(tree_root_dir_id integer, lower_ts timestamp without time zone DEFAULT NULL, upper_ts timestamp without time zone DEFAULT NULL, + OUT tree_num_files bigint, OUT tree_total_file_size bigint) +RETURNS record AS $$ +DECLARE + stats_row metainfo.stats%ROWTYPE; + subdir_id integer; + subdir_tree_num_files bigint; + subdir_tree_total_file_size bigint; +BEGIN + -- check for valid lower_ts/upper_ts + IF lower_ts IS NULL THEN + lower_ts := '-infinity'; + END IF; + IF upper_ts IS NULL THEN + upper_ts := 'infinity'; + END IF; + + SELECT st.* FROM metainfo.stats st + WHERE st.dir_id = tree_root_dir_id + LIMIT 1 + INTO stats_row; + + -- directory has no tree stats. 
So return 0,0 + IF stats_row.tree_min_file_creation_date IS NULL OR stats_row.tree_max_file_creation_date IS NULL THEN + tree_num_files := 0; + tree_total_file_size := 0; + RETURN; + END IF; + + + -- the tree stats of this directory have no overlap at all for the requested timerange + IF (stats_row.tree_min_file_creation_date > upper_ts) OR (stats_row.tree_max_file_creation_date < lower_ts) THEN + tree_num_files := 0; + tree_total_file_size := 0; + RETURN; + END IF; + + -- the tree stats of this directory have full overlap the requested timerange + IF stats_row.tree_min_file_creation_date >= lower_ts AND stats_row.tree_max_file_creation_date <= upper_ts THEN + tree_num_files := stats_row.tree_num_files; + tree_total_file_size := stats_row.tree_total_file_size; + RETURN; + END IF; + + -- the tree stats of this directory have partial overlap the requested timerange + -- recurse into subdirectories, and accumulate subdir results + IF stats_row.tree_min_file_creation_date <= upper_ts OR stats_row.tree_max_file_creation_date >= lower_ts THEN + -- start with the num_files and total_file_size in this dir... + IF stats_row.dir_num_files = 0 THEN + tree_num_files := 0; + tree_total_file_size := 0; + ELSIF stats_row.dir_min_file_creation_date >= lower_ts AND stats_row.dir_max_file_creation_date <= upper_ts THEN + -- all files in this dir are in the requested time range + -- when 'all files'=0, that's ok, cause then dir_num_files and dir_total_file_size are 0 which is the answer we need + tree_num_files := stats_row.dir_num_files; + tree_total_file_size := stats_row.dir_total_file_size; + ELSE + -- some files in this dir are in the requested time range + -- make selection of files in this dir in the requested time range + SELECT COUNT(fi.id), SUM(fi.size) FROM lta.fileinfo fi + WHERE fi.dir_id = tree_root_dir_id + AND fi.creation_date >= lower_ts AND fi.creation_date <= upper_ts + INTO tree_num_files, tree_total_file_size; + + IF tree_num_files = 0 THEN + tree_total_file_size := 0; + END IF; + END IF; + + --...and add all results from the subdirs which have at least partial overlap + FOR subdir_id in (SELECT d.id FROM lta.directory d + INNER JOIN metainfo.stats s on s.dir_id = d.id + WHERE d.parent_dir_id = tree_root_dir_id + AND NOT (s.tree_min_file_creation_date > upper_ts OR s.tree_max_file_creation_date < lower_ts)) LOOP + SELECT gts.tree_num_files, gts.tree_total_file_size FROM metainfo.get_tree_stats(subdir_id, lower_ts, upper_ts) as gts INTO subdir_tree_num_files, subdir_tree_total_file_size; + tree_num_files := tree_num_files + subdir_tree_num_files; + tree_total_file_size := tree_total_file_size + subdir_tree_total_file_size; + END LOOP; + RETURN; + END IF; +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION metainfo.get_site_stats(_site_id integer, lower_ts timestamp without time zone DEFAULT NULL::timestamp without time zone, upper_ts timestamp without time zone DEFAULT NULL::timestamp without time zone, + OUT tree_num_files bigint, OUT tree_total_file_size bigint) + RETURNS record AS $$ +BEGIN + SELECT SUM(gts.tree_num_files), SUM(gts.tree_total_file_size) + FROM lta.site_root_dir srd, metainfo.get_tree_stats(srd.root_dir_id, lower_ts, upper_ts) gts + WHERE srd.site_id = _site_id + INTO tree_num_files, tree_total_file_size; +END; +$$ LANGUAGE plpgsql; + + +-- END NORMAL FUNCTIONS + +-- +-- +-- -- VIEWS + +CREATE VIEW lta.site_root_directory as + select ss.id as site_id, ss.name as site_name, srd.root_dir_id, dir.name as dir_name + from lta.site_root_dir srd + join lta.directory dir 
on dir.id = srd.root_dir_id + join lta.site ss on ss.id = srd.site_id ; + +CREATE VIEW lta.site_quota_view as + select ss.id as site_id, ss.name as site_name, ssq.quota, ssq.valid_until_date + from lta.site ss + left join lta.site_quota ssq on ssq.site_id = ss.id; + +CREATE VIEW lta.site_quota_root_directory as + SELECT s.id AS site_id, s.name AS site_name, d.id AS dir_id, d.name AS dir_name + FROM lta.quota_root_dirs qrd + JOIN lta.site s ON s.id = qrd.site_id + JOIN lta.directory d ON d.id = qrd.root_dir_id; + +CREATE VIEW lta.site_directory_tree as + select rd.site_id as site_id, + rd.site_name as site_name, + rd.root_dir_id as root_dir_id, + rd.dir_name as root_dir_name, + dir.id as dir_id, + dir.name as dir_name, + dir.parent_dir_id as parent_dir_id, + dc.depth as depth + from lta.site_root_directory rd + inner join lta.directory_closure dc on dc.ancestor_id = rd.root_dir_id + inner join lta.directory dir on dc.descendant_id = dir.id; + +CREATE VIEW scraper.site_scraper_last_directory_visit as + select rd.site_id as site_id, + rd.site_name as site_name, + dir.id as dir_id, + dir.name as dir_name, + sldv.visit_date as last_visit + from lta.site_root_directory rd + inner join lta.directory_closure dc on dc.ancestor_id = rd.root_dir_id + inner join lta.directory dir on dc.descendant_id = dir.id + inner join scraper.last_directory_visit sldv on sldv.dir_id = dir.id ; + +CREATE VIEW lta.site_directory_file as + select site.id as site_id, + site.name as site_name, + dir.id as dir_id, + dir.name as dir_name, + fileinfo.id as file_id, + fileinfo.name as file_name, + fileinfo.size as file_size, + fileinfo.creation_date as file_creation_date + from lta.site site + join lta.site_root_dir srd on srd.site_id = site.id + inner join lta.directory_closure dc on dc.ancestor_id = srd.root_dir_id + inner join lta.directory dir on dc.descendant_id = dir.id + inner join lta.fileinfo on fileinfo.dir_id = dir.id ; + +CREATE VIEW metainfo.project_directory as + select + project.id as project_id, + project.name as project_name, + dir.id as dir_id, + dir.name as dir_name + from metainfo.project_top_level_directory ptld + inner join metainfo.project on project.id = ptld.project_id + inner join lta.directory_closure dc on dc.ancestor_id = ptld.dir_id + inner join lta.directory dir on dc.descendant_id = dir.id ; + +CREATE VIEW metainfo.site_directory_stats as + select sdt.site_id, + sdt.site_name, + sdt.dir_id, + sdt.dir_name, + st.dir_num_files, + st.dir_total_file_size, + st.dir_min_file_size, + st.dir_max_file_size, + st.dir_min_file_creation_date, + st.dir_max_file_creation_date, + st.tree_num_files, + st.tree_total_file_size, + st.tree_min_file_size, + st.tree_max_file_size, + st.tree_min_file_creation_date, + st.tree_max_file_creation_date + from lta.site_directory_tree sdt + left join metainfo.stats st on st.dir_id = sdt.dir_id; + +CREATE OR REPLACE VIEW metainfo.project_directory_stats AS + SELECT pd.project_id, pd.project_name, sds.* + FROM metainfo.project_directory pd + JOIN metainfo.site_directory_stats sds ON sds.dir_id = pd.dir_id; + +CREATE VIEW metainfo.observation_dataproduct_file as + SELECT sdf.site_id, sdf.site_name, dp.observation_id, dp.id as dataproduct_id, dp.name as dataproduct_name, sdf.dir_id, sdf.dir_name, sdf.file_id, sdf.file_name, sdf.file_size, sdf.file_creation_date + FROM metainfo.dataproduct dp + JOIN lta.site_directory_file sdf ON sdf.file_id = dp.fileinfo_id; + +CREATE VIEW metainfo.project_observation_dataproduct as + SELECT p.id AS project_id, + p.name AS project_name, + 
dp.observation_id, + dp.id AS dataproduct_id, + dp.name AS dataproduct_name, + dp.fileinfo_id AS fileinfo_id + FROM metainfo.dataproduct dp + INNER JOIN metainfo.project_observation po ON po.observation_id = dp.observation_id + INNER JOIN metainfo.project p ON p.id = po.project_id; + +CREATE VIEW metainfo.dataproduct_all as + SELECT pod.*, sdf.* + FROM metainfo.project_observation_dataproduct pod + INNER JOIN lta.site_directory_file sdf on sdf.file_id = pod.fileinfo_id; + +CREATE VIEW metainfo.site_project_stats as + select ptld.project_id, p.name as project_name, site_id, site_name, sds.dir_id, sds.dir_name, tree_num_files, tree_total_file_size, tree_min_file_creation_date, tree_max_file_creation_date + from metainfo.project_top_level_directory ptld + inner join metainfo.project p on p.id = ptld.project_id + inner join metainfo.site_directory_stats sds on sds.dir_id = ptld.dir_id + where tree_num_files IS NOT NULL; + +CREATE VIEW metainfo.project_stats AS + SELECT project_id, project_name, COUNT(site_id) num_sites, SUM(tree_num_files) total_num_files, SUM(tree_total_file_size) total_file_size, MIN(tree_min_file_creation_date) min_file_creation_date, MAX(tree_max_file_creation_date) max_file_creation_date + FROM metainfo.site_project_stats + group by project_id, project_name; + +CREATE VIEW metainfo.site_project_observation_dataproduct_dir_file AS + SELECT sdf.site_id, sdf.site_name, pod.project_id, pod.project_name, pod.observation_id, pod.dataproduct_id, pod.dataproduct_name, sdf.dir_id, sdf.dir_name, sdf.file_id, sdf.file_name, sdf.file_size, sdf.file_creation_date + FROM metainfo.project_observation_dataproduct pod + JOIN lta.site_directory_file sdf ON sdf.file_id = pod.fileinfo_id; + +CREATE VIEW metainfo.site_root_dir_tree_stats AS + SELECT srd.site_id, srd.site_name, srd.root_dir_id as root_dir_id, srd.dir_name as root_dir_name, + sds.tree_num_files, sds.tree_total_file_size, sds.tree_min_file_size, sds.tree_max_file_size, sds.tree_min_file_creation_date, sds.tree_max_file_creation_date + FROM lta.site_root_directory srd + INNER JOIN metainfo.site_directory_stats sds ON sds.dir_id = srd.root_dir_id; + +CREATE VIEW metainfo.site_stats as + SELECT site_id, site_name, SUM(tree_num_files) total_num_files, SUM(tree_total_file_size) total_file_size, MIN(tree_min_file_size) min_file_size, MAX(tree_max_file_size) max_file_size, MIN(tree_min_file_creation_date) min_file_creation_date, MAX(tree_max_file_creation_date) max_file_creation_date + from metainfo.site_root_dir_tree_stats + group by site_id, site_name; + +CREATE VIEW metainfo.site_quota_usage AS + SELECT ss.site_id, + ss.site_name, + sq.quota, + (SELECT gts.tree_total_file_size FROM metainfo.get_site_stats(ss.site_id, NULL, sq.valid_until_date) gts) as total_file_size, + sq.quota::bigint - (SELECT gts.tree_total_file_size FROM metainfo.get_site_stats(ss.site_id, NULL, sq.valid_until_date) gts)::bigint AS space_left, + (SELECT gts.tree_num_files FROM metainfo.get_site_stats(ss.site_id, NULL, sq.valid_until_date) gts) AS num_files, + sq.valid_until_date + FROM metainfo.site_stats ss + JOIN lta.site_quota sq ON sq.site_id = ss.site_id; + +CREATE OR REPLACE VIEW metainfo.site_quota_root_dir_stats AS + SELECT sds.site_id, sds.site_name, sds.dir_id, sds.dir_name, sds.tree_num_files, sds.tree_total_file_size + FROM lta.quota_root_dirs qrd + INNER JOIN metainfo.site_directory_stats sds on sds.dir_id = qrd.root_dir_id; + + +-- END VIEWS + +COMMIT; + +--insert into lta.site (name, url) VALUES ('site1', 'url1'); +--insert into lta.site (name, 
url) VALUES ('site2', 'url2'); +--insert into lta.directory (name) VALUES ('root_dir1'); +--insert into lta.directory (name) VALUES ('root_dir2a'); +--insert into lta.directory (name) VALUES ('root_dir2b'); +--insert into lta.site_root_dir (site_id, dir_id) VALUES (1,1); +--insert into lta.site_root_dir (site_id, dir_id) VALUES (2,2); +--insert into lta.site_root_dir (site_id, dir_id) VALUES (2,3); +--insert into lta.directory (name, parent_dir_id) VALUES ('subdir', 1); +--insert into lta.directory (name, parent_dir_id) VALUES ('subdir', 2); +--insert into lta.directory (name, parent_dir_id) VALUES ('subsubdirA', 3); +--insert into lta.directory (name, parent_dir_id) VALUES ('subsubdirB', 3); +--insert into lta.directory (name, parent_dir_id) VALUES ('subsubsubdirA', 4); +-- +----insert into lta.fileinfo (name, size, creation_date, dir_id) VALUES ('file1', 100, '2017-01-01', 7); +----select * from metainfo.stats order by dir_id asc; +-- +--BEGIN; +--insert into lta.fileinfo (name, size, creation_date, dir_id) +--VALUES ('file2', 110, '2017-01-02', 2),('file3', 120, '2017-01-03', 3), ('file4', 130, '2017-01-04', 4), ('file5', 140, '2017-01-05', 4); +--COMMIT; +----select * from metainfo.stats order by dir_id asc; +---- +----insert into lta.fileinfo (name, size, creation_date, dir_id) VALUES ('file6', 150, '2017-01-06', 5); +------select * from metainfo.stats order by dir_id asc; +---- +--BEGIN; +--insert into lta.fileinfo (name, size, creation_date, dir_id) VALUES ('file7', 160, '2017-01-07', 8); +--insert into lta.fileinfo (name, size, creation_date, dir_id) VALUES ('file8', 180, '2017-01-08', 8); +--COMMIT; +-- +--select * from lta.site_directory_file order by dir_id asc, file_id asc; +--select * from metainfo.site_directory_stats; +-- +--SELECT metainfo.get_tree_stats(1, '-infinity'::timestamp, 'infinity'::timestamp); +--SELECT metainfo.get_tree_stats(1, '2017-01-03', 'infinity'::timestamp); +--SELECT metainfo.get_tree_stats(1, '2017-01-01', '2017-01-02'); +--SELECT metainfo.get_tree_stats(1, '2017-01-07', '2017-01-08'); diff --git a/LTA/ltastorageoverview/lib/scraper.py b/LTA/ltastorageoverview/lib/scraper.py index fd5a6d6dcf3..b6729796796 100755 --- a/LTA/ltastorageoverview/lib/scraper.py +++ b/LTA/ltastorageoverview/lib/scraper.py @@ -39,6 +39,8 @@ from random import random, randint logger = logging.getLogger() +VISIT_INTERVAL = datetime.timedelta(days=3) + class FileInfo: '''Simple struct to hold filename and size''' def __init__(self, filename, size, created_at): @@ -123,8 +125,9 @@ class Location: # the core command: do an srmls call and parse the results # srmls can only yield max 900 items in a result, hence we can recurse for the next 900 by using the offset - lexar_nr = randint(1,2) - cmd = ['ssh', 'ingest@10.178.1.%d' % lexar_nr, "bash", "-c", "\'source %s;srmls -l -count=900 -offset=%d %s%s\'" % ( + lexar_nr = 4 #randint(3,4) + lexar_host = 'ingest@10.178.1.%d' % (lexar_nr,) + cmd = ['ssh', '-tt', '-n', '-x', '-q', lexar_host, "bash", "-c", "\'source %s;srmls -l -count=900 -offset=%d %s%s\'" % ( '/globalhome/ingest/service/bin/init.sh' if lexar_nr <= 2 else '/globalhome/ingest/.grid/.ingest_profile', offset, self.srmurl, @@ -168,7 +171,7 @@ class Location: raise ParseException("Could not parse dirname from line: %s\nloglines:\n%s" % (pathLineItems[1], logs[0])) - foundDirectories.append(Location(self.srmurl, dirname)) + foundDirectories.append(Location(self.srmurl, dirname.rstrip('/'))) elif entryType.lower() == 'file': try: filesize = int(pathLineItems[0]) @@ -176,7 +179,7 
@@ class Location: timestamplines = [x for x in lines if 'ed at:' in x] timestampline = None for line in timestamplines: -                        if 'created' in line: +                        if 'created' in line and '1970' not in line: timestampline = line break timestampline = line @@ -218,7 +221,7 @@ class LocationResult: A list of files in this location ''' self.location = location -        self.subDirectories = subDirectories if subDirectories else [] +        self.subDirectories = [s.rstrip('/') for s in subDirectories] if subDirectories else [] self.files = files if files else [] def __str__(self): @@ -238,18 +241,18 @@ class ResultGetterThread(threading.Thread): '''Helper class to query Locations asynchronously for results. Gets the result for the first Location in the locations deque and appends it to the results deque Appends the subdirectory Locations at the end of the locations deque for later processing''' -    def __init__(self, dbcreds, dir_id, verbose=False): +    def __init__(self, dbcreds, dir_id, log_queries=False): threading.Thread.__init__(self) self.daemon = True self.dbcreds = dbcreds -        self.verbose = verbose +        self.log_queries = log_queries self.dir_id = dir_id def run(self): '''A single location is pop\'ed from the locations deque and the results are queried. Resulting subdirectories are appended to the locations deque''' try: -            with store.LTAStorageDb(self.dbcreds, self.verbose) as db: +            with store.LTAStorageDb(self.dbcreds, self.log_queries) as db: dir = db.directory(self.dir_id) if not dir: @@ -265,28 +268,89 @@ class ResultGetterThread(threading.Thread): location = Location(srm_url, dir_name) try: +                def rescheduleVisit(): +                    for i in range(5): +                        try: +                            with store.LTAStorageDb(self.dbcreds, self.log_queries) as db: +                                logger.info('Rescheduling %s for new visit.' % (location.path(),)) +                                db.updateDirectoryLastVisitTime(self.dir_id, datetime.datetime.utcnow() - VISIT_INTERVAL + datetime.timedelta(minutes=1)) +                            break +                        except Exception: +                            time.sleep(1) + +                # get results... long blocking result = location.getResult() logger.info(result)
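The rescheduleVisit helper above is a small retry loop: it sets the directory's last-visit time back to just inside the VISIT_INTERVAL window, so the directory becomes due for a new visit about a minute later, and it retries the DB update a few times before giving up. The same pattern in isolation, as a generic sketch (the names are illustrative, not part of this patch):

import time

def retry(action, attempts=5, delay=1.0):
    # try a flaky action a few times, sleeping between attempts;
    # returns the action's result, or None if every attempt raised
    for i in range(attempts):
        try:
            return action()
        except Exception:
            if i < attempts - 1:
                time.sleep(delay)

print retry(lambda: 42)  # -> 42 on the first attempt

-                with store.LTAStorageDb(self.dbcreds, self.verbose) as db: -                    if result.files: -                        result_file_tuple_set = set( (file.filename.split('/')[-1], int(file.size), file.created_at, dir_id) for file in result.files) - -                        #check for already known files in the db -                        known_file_tuple_set = set((str(file['name']), int(file['size']), file['creation_date'], dir_id) for file in db.filesInDirectory(dir_id)) -                        new_file_tuple_set = result_file_tuple_set - known_file_tuple_set; - -                        logger.info("%s %s: %d out of %d files are new, and %d are already known", site['name'], dir_name, len(new_file_tuple_set), len(result_file_tuple_set), len(known_file_tuple_set)) - -                        if new_file_tuple_set: -                            file_ids = db.insertFileInfos(new_file_tuple_set) - -                            if len(file_ids) != len(new_file_tuple_set): -                                logger.info('Rescheduling %s for new visit.'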
% (location.path(),)) -                                db.updateDirectoryLastVisitTime(self.dir_id, datetime.datetime.utcnow() - datetime.timedelta(days=1000)) - -                    subDirectoryNames = [loc.directory for loc in result.subDirectories] +                with store.LTAStorageDb(self.dbcreds, self.log_queries) as db: +                    # convert the result.files list into a dict +                    # with (filename, dir_id) as key and a tuple with all file info as value +                    result_file_tuple_dict = {} +                    for file in result.files: +                        filename = file.filename.split('/')[-1] +                        key = (filename, dir_id) +                        file_tuple = (filename, int(file.size), file.created_at, dir_id) +                        result_file_tuple_dict[key] = file_tuple + +                    # create a dict of all already known files from the db +                    known_file_dict = {} +                    for file in db.filesInDirectory(dir_id): +                        key = (str(file['name']), dir_id) +                        known_file_dict[key] = file + +                    # now compare the result and known (filename, dir_id) sets +                    # and find out which are new, and which are known. +                    # compare only by (filename, dir_id) because for a given file the size and/or date might have changed, +                    # but that does not make it a new/unique file. +                    result_file_key_set = set(result_file_tuple_dict.keys()) +                    known_file_key_set = set(known_file_dict.keys()) +                    new_file_key_set = result_file_key_set - known_file_key_set +                    removed_file_key_set = known_file_key_set - result_file_key_set + +                    logger.info("%s %s: %d out of %d files are new, and %d are already known", site['name'], +                                                                                               dir_name, +                                                                                               len(new_file_key_set), +                                                                                               len(result_file_key_set), +                                                                                               len(known_file_key_set)) + +                    if new_file_key_set: +                        new_file_tuple_set = [result_file_tuple_dict[key] for key in new_file_key_set] +                        file_ids = db.insertFileInfos(new_file_tuple_set) + +                        if len(file_ids) != len(new_file_tuple_set): +                            rescheduleVisit() + +                    if known_file_key_set: +                        for key, known_file in known_file_dict.items(): +                            if key in result_file_tuple_dict: +                                result_file_tuple = result_file_tuple_dict[key] + +                                known_size = int(known_file['size']) +                                known_creation_date = known_file['creation_date'] + +                                result_size = result_file_tuple[1] +                                result_creation_date = result_file_tuple[2] + +                                if known_size != result_size: +                                    logger.info("%s %s: updating %s (id=%d) size from %d to %d", +                                                site['name'], dir_name, known_file['name'], known_file['id'], +                                                known_size, result_size) +                                    db.updateFileInfoSize(known_file['id'], result_size) + +                    if removed_file_key_set: +                        for removed_file_key in removed_file_key_set: +                            db.deleteFileInfoFromDirectory(removed_file_key[0], removed_file_key[1])
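The insert/update/delete reconciliation above boils down to keying both listings on (filename, dir_id) and taking set differences. A toy standalone illustration of that pattern (hypothetical data, not part of the patch):

dir_id = 42
result_files = {('a.tar', dir_id): 100, ('b.tar', dir_id): 200}   # fresh srmls listing: key -> size
known_files = {('b.tar', dir_id): 150, ('c.tar', dir_id): 300}    # what the db currently holds

new_keys = set(result_files) - set(known_files)       # {('a.tar', 42)}: insert these
removed_keys = set(known_files) - set(result_files)   # {('c.tar', 42)}: delete these
for key in set(result_files) & set(known_files):      # {('b.tar', 42)}: update if the size changed
    if result_files[key] != known_files[key]:
        print '%s: size %d -> %d' % (key[0], known_files[key], result_files[key])

+ +                    # skip empty nikhef dirs +                    filteredSubDirectories = [loc for loc in result.subDirectories +                                              if not ('nikhef' in loc.srmurl and 'generated' in loc.directory) ] + +                    # skip sksp spectroscopy project +                    filteredSubDirectories = [loc for loc in filteredSubDirectories +                                              if not ('sara' in loc.srmurl and 'sksp' in loc.directory and 'spectro' in loc.directory) ] + +                    subDirectoryNames = [loc.directory for loc in filteredSubDirectories] if subDirectoryNames: #check for already known subdirectories in the db @@ -300,41 +364,39 @@ class ResultGetterThread(threading.Thread): subdir_ids = db.insertSubDirectories(new_subdir_name_set, dir_id) if len(subdir_ids) != len(new_subdir_name_set): -                            logger.info('Rescheduling %s for new visit.'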
% (location.path(),)) - db.updateDirectoryLastVisitTime(self.dir_id, datetime.datetime.utcnow() - datetime.timedelta(days=1000)) + rescheduleVisit() except (SrmlsException, ParseException) as e: logger.error('Error while scanning %s\n%s' % (location.path(), str(e))) - with store.LTAStorageDb(self.dbcreds, self.verbose) as db: - logger.info('Rescheduling %s for new visit.' % (location.path(),)) - db.updateDirectoryLastVisitTime(self.dir_id, datetime.datetime.utcnow() - datetime.timedelta(days=1000)) + if 'does not exist' in str(e): + with store.LTAStorageDb(self.dbcreds, self.log_queries) as db: + db.deleteDirectory(self.dir_id) + else: + rescheduleVisit() except Exception as e: logger.error(str(e)) - with store.LTAStorageDb(self.dbcreds, self.verbose) as db: + with store.LTAStorageDb(self.dbcreds, self.log_queries) as db: logger.info('Rescheduling dir_id %d for new visit.' % (self.dir_id,)) - db.updateDirectoryLastVisitTime(self.dir_id, datetime.datetime.utcnow() - datetime.timedelta(days=1000)) + db.updateDirectoryLastVisitTime(self.dir_id, datetime.datetime.utcnow() - VISIT_INTERVAL) def populateDbWithLTASitesAndRootDirs(db): if not db.sites(): - #db.insertSite('target', 'srm://srm.target.rug.nl:8444') #db.insertSite('nikhef', 'srm://tbn18.nikhef.nl:8446') - db.insertSite('sara', 'srm://srm.grid.sara.nl:8443') - db.insertSite('juelich', 'srm://lofar-srm.fz-juelich.de:8443') - db.insertSite('poznan', 'srm://lta-head.lofar.psnc.pl:8443') + sara_id = db.insertSite('sara', 'srm://srm.grid.sara.nl:8443') + juelich_id = db.insertSite('juelich', 'srm://lofar-srm.fz-juelich.de:8443') + poznan_id = db.insertSite('poznan', 'srm://lta-head.lofar.psnc.pl:8443') + + for site_id in [sara_id, juelich_id, poznan_id]: + for i in range(1, 8): + db.executeQuery('insert into lta.site_quota(site_id, quota, valid_until_date) values (%s, %s, %s);', (site_id, long(i*1e15), '201%d-12-31 23:59:59'%i)) - #db.insertRootDirectory('target', '/lofar/ops') - #db.insertRootDirectory('target', '/lofar/ops/disk') #db.insertRootDirectory('nikhef', '/dpm/nikhef.nl/home/lofar') - db.insertRootDirectory('sara', '/pnfs/grid.sara.nl/data/lofar/ops') - db.insertRootDirectory('sara', '/pnfs/grid.sara.nl/data/lofar/user') - db.insertRootDirectory('sara', '/pnfs/grid.sara.nl/data/lofar/software') - db.insertRootDirectory('sara', '/pnfs/grid.sara.nl/data/lofar/storage') - db.insertRootDirectory('sara', '/pnfs/grid.sara.nl/data/lofar/pulsar') + db.insertRootDirectory('sara', '/pnfs/grid.sara.nl/data/lofar') db.insertRootDirectory('juelich', '/pnfs/fz-juelich.de/data/lofar/ops') - db.insertRootDirectory('poznan', '/lofar/ops/projects') + db.insertRootDirectory('poznan', '/lofar/ops') def main(): @@ -355,13 +417,15 @@ def main(): logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.DEBUG if options.verbose else logging.INFO) - dbcreds = dbcredentials.parse_options(options) + options.parallel = max(1, min(8*multiprocessing.cpu_count(), options.parallel)) + logger.info("Using maximum number of parallel srmls jobs: %d" % options.parallel) + dbcreds = dbcredentials.parse_options(options) logger.info("Using dbcreds: %s" % dbcreds.stringWithHiddenPassword()) db = store.LTAStorageDb(dbcreds, options.log_queries) populateDbWithLTASitesAndRootDirs(db) - + # for each site we want one or more ResultGetterThreads # so make a dict with a list per site based on the locations getters = dict([(site['name'],[]) for site in db.sites()]) @@ -369,85 +433,86 @@ def main(): # some helper functions def numLocationsInQueues(): 
'''returns the total number of locations in the queues''' -        return db.numDirectoriesNotVisitedSince(datetime.datetime.utcnow() - datetime.timedelta(days=7)) +        return db.numDirectoriesNotVisitedSince(datetime.datetime.utcnow() - VISIT_INTERVAL) def totalNumGetters(): '''returns the total number of parallel running ResultGetterThreads''' return sum([len(v) for v in getters.values()]) -    #print 'numLocationsInQueues()', numLocationsInQueues() -    #print 'totalNumGetters()', totalNumGetters() +    def cleanupFinishedGetters(): +        # get rid of old finished ResultGetterThreads +        finishedGetters = dict([(site_name, [getter for getter in getterList if not getter.isAlive()]) for site_name, getterList in getters.items()]) +        for site_name,finishedGetterList in finishedGetters.items(): +            for finishedGetter in finishedGetterList: +                getters[site_name].remove(finishedGetter) + + +    # the main loop +    # loop over the locations and spawn ResultGetterThreads to get the results in parallel +    # use load balancing over the different sites and with respect to queue lengths +    # do not overload this host system +    while True: -    # only enter main loop if there is anything to process -    if numLocationsInQueues() > 0: +        cleanupFinishedGetters() -        # the main loop -        # loop over the locations and spawn ResultGetterThreads to get the results parallel -        # use load balancing over the different sites and with respect to queue lengths +        # spawn new ResultGetterThreads # do not overload this host system -        while numLocationsInQueues() > 0 or totalNumGetters() > 0: - -            # get rid of old finished ResultGetterThreads -            finishedGetters = dict([(site_name, [getter for getter in getterList if not getter.isAlive()]) for site_name, getterList in getters.items()]) -            for site_name,finishedGetterList in finishedGetters.items(): -                for finishedGetter in finishedGetterList: -                    getters[site_name].remove(finishedGetter) - -            # spawn new ResultGetterThreads -            # do not overload this host system -            #while numLocationsInQueues() > 0 and (totalNumGetters() <= 4 or -                                                  #(os.getloadavg()[0] < 3*multiprocessing.cpu_count() and -                                                  #totalNumGetters() < 2.5*multiprocessing.cpu_count())): -            while numLocationsInQueues() > 0 and (totalNumGetters() < options.parallel): -                sitesStats = db.visitStats(datetime.datetime.utcnow() - datetime.timedelta(days=7)) - -                for site_name, site_stats in sitesStats.items(): -                    numGetters = len(getters[site_name]) -                    queue_length = site_stats['queue_length'] -                    weight = float(queue_length) / float(20 * (numGetters + 1)) -                    if numGetters == 0 and queue_length > 0: -                        weight = 1e6 # make getterless sites extra important, so each site keeps flowing -                    site_stats['# get'] = numGetters -                    site_stats['weight'] = weight - -                totalWeight = max(1.0, sum([site_stats['weight'] for site_stats in sitesStats.values()])) - -                logger.debug("siteStats:\n%s" % str('\n'.join([str((k, v)) for k, v in sitesStats.items()]))) - -                # now pick a random site using the weights -                chosen_site_name = None -                cumul = 0.0 -                r = random() -                for site_name,site_stats in sitesStats.items(): -                    ratio = site_stats['weight']/totalWeight -                    cumul += ratio - -                    if r <= cumul and site_stats['queue_length'] > 0: -                        chosen_site_name = site_name -                        break - -                if not chosen_site_name: +        while (numLocationsInQueues() > 0 and +               totalNumGetters() < options.parallel and +               os.getloadavg()[0] < 4*multiprocessing.cpu_count()): +            sitesStats = db.visitStats(datetime.datetime.utcnow() - VISIT_INTERVAL) + +            for site_name, site_stats in sitesStats.items(): +                numGetters = len(getters[site_name]) +                queue_length =
site_stats['queue_length'] +                weight = float(queue_length) / float(20 * (numGetters + 1)) +                if numGetters == 0 and queue_length > 0: +                    weight = 1e6 # make getterless sites extra important, so each site keeps flowing +                site_stats['# get'] = numGetters +                site_stats['weight'] = weight + +            totalWeight = max(1.0, sum([site_stats['weight'] for site_stats in sitesStats.values()])) + +            logger.debug("siteStats:\n%s" % str('\n'.join([str((k, v)) for k, v in sitesStats.items()]))) + +            # now pick a random site using the weights +            chosen_site_name = None +            cumul = 0.0 +            r = random() +            for site_name,site_stats in sitesStats.items(): +                ratio = site_stats['weight']/totalWeight +                cumul += ratio + +                if r <= cumul and site_stats['queue_length'] > 0: +                    chosen_site_name = site_name break -                chosen_dir_id = sitesStats[chosen_site_name]['least_recent_visited_dir_id'] -                db.updateDirectoryLastVisitTime(chosen_dir_id, datetime.datetime.utcnow()) +            if not chosen_site_name: +                break + +            chosen_dir_id = sitesStats[chosen_site_name]['least_recent_visited_dir_id'] +            db.updateDirectoryLastVisitTime(chosen_dir_id, datetime.datetime.utcnow()) + +            logger.debug("chosen_site_name: %s chosen_dir_id: %s", chosen_site_name, chosen_dir_id) -                logger.debug("chosen_site_name: %s chosen_dir_id: %s", chosen_site_name, chosen_dir_id) +            # make and start a new ResultGetterThread for the location deque of the chosen site +            newGetter = ResultGetterThread(dbcreds, chosen_dir_id, options.log_queries) +            newGetter.start() +            getters[chosen_site_name].append(newGetter) -                # make and start a new ResultGetterThread the location deque of the chosen site -                newGetter = ResultGetterThread(dbcreds, chosen_dir_id, options.log_queries) -                newGetter.start() -                getters[chosen_site_name].append(newGetter) +            cleanupFinishedGetters() -            logger.info('numLocationsInQueues=%d totalNumGetters=%d siteQueueLengths: %s' % (numLocationsInQueues(), totalNumGetters(), -                        ' '.join(['%s:%d' % (name, stats['queue_length']) for name, stats in sitesStats.items()]))) +            logger.info('numLocationsInQueues=%d totalNumGetters=%d siteQueueLengths: %s load_1min: %.1f' % (numLocationsInQueues(), +                                                                                                             totalNumGetters(), +                                                                                                             ' '.join(['%s:%d' % (name, stats['queue_length']) for name, stats in sitesStats.items()]), +                                                                                                             os.getloadavg()[0])) -            # sleep before main loop next iteration -            # to wait for some results -            # and some getters to finish -            time.sleep(0.25 if numLocationsInQueues() == 0 else 5) +        # sleep before main loop next iteration +        # to wait for some results +        # and some getters to finish +        time.sleep(5 if numLocationsInQueues() <= options.parallel else 0.25) -        # all locations were processed +    # all locations were processed if __name__ == "__main__": main()
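The site picker in the main loop above is a weighted random choice: a site's weight grows with its queue length and shrinks with the number of getters it already has, and a site with a non-empty queue but no getters is made near-certain to win so no site starves. The same idea as a standalone sketch (the stats dict is hypothetical, not part of this patch):

from random import random

def pick_site(site_stats):
    # site_stats: {name: {'queue_length': int, 'num_getters': int}}
    weights = {}
    for name, stats in site_stats.items():
        weight = float(stats['queue_length']) / float(20 * (stats['num_getters'] + 1))
        if stats['num_getters'] == 0 and stats['queue_length'] > 0:
            weight = 1e6  # starving site: make it almost certain to be chosen
        weights[name] = weight
    total = max(1.0, sum(weights.values()))
    r, cumul = random(), 0.0
    for name, weight in weights.items():
        cumul += weight / total
        if r <= cumul and site_stats[name]['queue_length'] > 0:
            return name
    return None

# juelich has no getters yet, so it wins almost every draw
print pick_site({'sara': {'queue_length': 100, 'num_getters': 2},
                 'juelich': {'queue_length': 5, 'num_getters': 0}})

diff --git a/LTA/ltastorageoverview/lib/store.py index 3bb632a6b0f..1afecc829c3 100644 --- a/LTA/ltastorageoverview/lib/store.py +++ b/LTA/ltastorageoverview/lib/store.py @@ -23,12 +23,8 @@ # TODO: reuse connection in methods (take care of exceptions closing the connection) # TODO: use generators and yield for faster and more memory efficient processing of results.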
-import os -import os.path import datetime import logging -import psycopg2 -import psycopg2.extras from lofar.common import dbcredentials from lofar.common.postgres import PostgresDatabaseConnection from lofar.common.postgres import FETCH_NONE,FETCH_ONE,FETCH_ALL @@ -39,7 +35,7 @@ class EntryNotFoundException(Exception): pass class LTAStorageDb(PostgresDatabaseConnection): - def __init__(self, dbcreds=None, log_queries=False): + def __init__(self, dbcreds=None, log_queries=True): super(LTAStorageDb, self).__init__(dbcreds=dbcreds, log_queries=log_queries) def insertSite(self, siteName, srmurl): @@ -48,7 +44,7 @@ class LTAStorageDb(PostgresDatabaseConnection): if site: return site['id'] - site_id = self.executeQuery('insert into lta.storage_site (name, url) values (%s, %s) returning id;', (siteName, srmurl), fetch=FETCH_ONE)['id'] + site_id = self.executeQuery('insert into lta.site (name, url) values (%s, %s) returning id;', (siteName, srmurl), fetch=FETCH_ONE)['id'] self.commit() return site_id @@ -62,12 +58,12 @@ class LTAStorageDb(PostgresDatabaseConnection): dir_id = self.executeQuery('insert into lta.directory (name) values (%s) returning id;', [rootDirectory], fetch=FETCH_ONE)['id'] - self.executeQuery('insert into lta.storage_site_root_dir (storage_site_id, directory_id) values (%s, %s);', (site_id, dir_id)) + self.executeQuery('insert into lta.site_root_dir (site_id, root_dir_id) values (%s, %s);', (site_id, dir_id)) self.commit() return dir_id - def insertSubDirectory(self, parent_directory_id, sub_directory_path): - result = self.executeQuery('insert into lta.directory (name, parent_directory_id) values (%s, %s) returning id;', (sub_directory_path, parent_directory_id), fetch=FETCH_ONE) + def insertSubDirectory(self, parent_dir_id, sub_directory_path): + result = self.executeQuery('insert into lta.directory (name, parent_dir_id) values (%s, %s) returning id;', (sub_directory_path, parent_dir_id), fetch=FETCH_ONE) if result and 'id' in result: self.commit() @@ -79,7 +75,7 @@ class LTAStorageDb(PostgresDatabaseConnection): with self._connection.cursor() as cursor: insert_values = ','.join(cursor.mogrify('(%s, %s)', (name, parentDirId)) for name in subDirectoryPaths) - query = '''insert into lta.directory (name, parent_directory_id) + query = '''insert into lta.directory (name, parent_dir_id) VALUES {values} RETURNING id;'''.format(values=insert_values) @@ -94,7 +90,7 @@ class LTAStorageDb(PostgresDatabaseConnection): with self._connection.cursor() as cursor: insert_values = ','.join(cursor.mogrify('(%s, %s)', (directoryLastVisitTime, id)) for id in subDirIds) - query = '''insert into scraper.last_directory_visit (visit_date, directory_id) + query = '''insert into scraper.last_directory_visit (visit_date, dir_id) VALUES {values} RETURNING id;'''.format(values=insert_values) @@ -108,9 +104,16 @@ class LTAStorageDb(PostgresDatabaseConnection): self.commit() return subDirIds - def insertFileInfo(self, name, size, creation_date, parent_directory_id, commit=True): - fileinfo_id = self.executeQuery('insert into lta.fileinfo (name, size, creation_date, directory_id) values (%s, %s, %s, %s) returning id;', - (name.split('/')[-1], size, creation_date, parent_directory_id)) + def deleteDirectory(self, dir_id, commit=True): + self.executeQuery('DELETE FROM lta.directory where id = %s;', (dir_id,), fetch=FETCH_NONE) + + if commit: + self.commit() + + + def insertFileInfo(self, name, size, creation_date, parent_dir_id, commit=True): + fileinfo_id = self.executeQuery('insert into lta.fileinfo 
(name, size, creation_date, dir_id) values (%s, %s, %s, %s) returning id;', +                                         (name.split('/')[-1], size, creation_date, parent_dir_id)) if commit: self.commit() @@ -122,7 +125,7 @@ class LTAStorageDb(PostgresDatabaseConnection): insert_values = ','.join([x for x in insert_values]) -            query = '''insert into lta.fileinfo (name, size, creation_date, directory_id) +            query = '''insert into lta.fileinfo (name, size, creation_date, dir_id) VALUES {values} RETURNING id;'''.format(values=insert_values) @@ -136,80 +139,107 @@ class LTAStorageDb(PostgresDatabaseConnection): self.commit() return ids -    def updateDirectoryLastVisitTime(self, directory_id, timestamp): -        updated = bool(self.executeQuery('''update scraper.last_directory_visit -                                            set visit_date=%s -                                            where directory_id = %s -                                            returning id;''', (timestamp, directory_id), fetch=FETCH_ONE)) +    def updateFileInfoSize(self, id, size, commit=True): +        fileinfo_id = self.executeQuery('''update lta.fileinfo set size=%s where id=%s;''', (size, id)) -        if not updated: -            self.executeQuery('''insert into scraper.last_directory_visit -                                 (visit_date, directory_id) -                                 values (%s, %s)''', (timestamp, directory_id)) +        if commit: +            self.commit() -        self.commit() +    def deleteFileInfoFromDirectory(self, file_name, dir_id, commit=True): +        self.executeQuery('DELETE FROM lta.fileinfo where dir_id = %s and name = %s;', (dir_id,file_name), fetch=FETCH_NONE) + +        if commit: +            self.commit() + +    def updateDirectoryLastVisitTime(self, dir_id, timestamp, commit=True): +        self.executeQuery('''update scraper.last_directory_visit +                             set visit_date=%s +                             where dir_id = %s;''', (timestamp, dir_id), fetch=FETCH_NONE) + +        if commit: +            self.commit() def sites(self): '''returns list of tuples (id, name, url) of all sites''' -        return self.executeQuery('SELECT id, name, url FROM lta.storage_site;', fetch=FETCH_ALL) +        return self.executeQuery('SELECT id, name, url FROM lta.site;', fetch=FETCH_ALL) def site(self, site_id): '''returns tuple (id, name, url) for site with id=site_id''' -        return self.executeQuery('SELECT id, name, url FROM lta.storage_site where id = %s;', [site_id], FETCH_ONE) +        return self.executeQuery('SELECT id, name, url FROM lta.site where id = %s;', [site_id], FETCH_ONE) def siteByName(self, site_name): '''returns tuple (id, name, url) for site with name=site_name''' -        return self.executeQuery('SELECT id, name, url FROM lta.storage_site where name = %s;', [site_name], FETCH_ONE) +        return self.executeQuery('SELECT id, name, url FROM lta.site where name = %s;', [site_name], FETCH_ONE) + +    def siteQuota(self, site_id): +        '''returns list of quota tuples (site_id, quota, valid_until_date)''' +        return self.executeQuery('SELECT * FROM lta.site_quota;', fetch=FETCH_ALL)
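The new quota helper is easiest to understand from a small interactive session; a sketch only, assuming a populated LTASO database and the LTASO dbcreds used elsewhere in this file:

from lofar.common import dbcredentials
from lofar.lta.ltastorageoverview import store

dbcreds = dbcredentials.DBCredentials().get('LTASO')
with store.LTAStorageDb(dbcreds) as db:
    for site in db.sites():
        print site['name'], site['url']
    # note: the current implementation returns the quota rows of all sites,
    # one row per (site_id, valid_until_date) period as inserted by the scraper
    for quota_row in db.siteQuota(1):
        print quota_row['site_id'], quota_row['quota'], quota_row['valid_until_date']

-    def directory(self, directory_id): -        '''returns lta.directory (id, name, site_id, site_name) for the given directory_id''' +    def directory(self, dir_id): +        '''returns lta.directory (id, name, site_id, site_name) for the given dir_id''' return self.executeQuery('''SELECT dir.id as dir_id, dir.name as dir_name, site.id as site_id, site.name as site_name -                                    FROM lta.storage_site_root_dir -                                    join lta.storage_site site on site.id = storage_site_root_dir.storage_site_id -                                    join lta.directory_closure dc on dc.ancestor_id = storage_site_root_dir.directory_id +                                    FROM lta.site_root_dir +                                    join lta.site site on site.id = site_root_dir.site_id +                                    join lta.directory_closure dc on dc.ancestor_id = site_root_dir.root_dir_id join lta.directory dir on dir.id = dc.descendant_id where dc.descendant_id = %s; -                                 ''', [directory_id],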
fetch=FETCH_ONE) + ''', [dir_id], fetch=FETCH_ONE) - def directory_id(self, site_id, directory_name): + def directoryByName(self, dir_name): + '''returns lta.directory (id, name, site_id, site_name) for the given dir_name''' + return self.executeQuery('''SELECT dir.id as dir_id, dir.name as dir_name, site.id as site_id, site.name as site_name + FROM lta.site_root_dir + join lta.site site on site.id = site_root_dir.site_id + join lta.directory_closure dc on dc.ancestor_id = site_root_dir.root_dir_id + join lta.directory dir on dir.id = dc.descendant_id + where dir.name = %s; + ''', [dir_name], fetch=FETCH_ONE) + + def dir_id(self, site_id, directory_name): '''returns lta.directory id for the given site_id, directory_name''' result = self.executeQuery('''SELECT dir.id - FROM lta.storage_site_root_dir - join lta.directory_closure dc on dc.ancestor_id = storage_site_root_dir.directory_id + FROM lta.site_root_dir + join lta.directory_closure dc on dc.ancestor_id = site_root_dir.root_dir_id join lta.directory dir on dir.id = dc.descendant_id - where storage_site_root_dir.storage_site_id = %s + where site_root_dir.site_id = %s and dir.name = %s;''', [site_id, directory_name], fetch=FETCH_ONE) - if result and result['id']: + if result['id']: return result['id'] return -1 def rootDirectories(self): '''returns list of all root directories for all sites''' - return self.executeQuery('''SELECT * FROM lta.root_directories;''', fetch=FETCH_ALL) + return self.executeQuery('''SELECT * FROM lta.site_root_directory;''', fetch=FETCH_ALL) def rootDirectoriesForSite(self, site_id): '''returns list of all root directories (id, name) for given site_id''' - return self.executeQuery('''SELECT * FROM lta.root_directories where site_id = %s;''', [site_id], fetch=FETCH_ALL) - - def subDirectories(self, directory_id, depth = 1, includeSelf=False): - '''returns list of all sub directories up to the given depth (id, name, parent_dir_id, depth) for the given directory_id''' + return self.executeQuery('''SELECT * FROM lta.site_root_directory where site_id = %s;''', [site_id], fetch=FETCH_ALL) + + def subDirectories(self, dir_id, depth = 1, includeSelf=False): + '''returns list of all sub directories up to the given depth (id, name, parent_dir_id, depth) for the given dir_id''' + if depth == 1 and not includeSelf: + return self.executeQuery(''' + SELECT dir.id as id, dir.name as name, dir.parent_dir_id as parent_dir_id + FROM lta.directory dir + where dir.parent_dir_id = %s; + ''', (dir_id, ), fetch=FETCH_ALL) return self.executeQuery(''' - SELECT dir.id as id, dir.name as name, dir.parent_directory_id as parent_dir_id, lta.directory_closure.depth as depth + SELECT dir.id as id, dir.name as name, dir.parent_dir_id as parent_dir_id, lta.directory_closure.depth as depth FROM lta.directory_closure join lta.directory dir on dir.id = lta.directory_closure.descendant_id where ancestor_id = %s and depth <= %s and depth > %s order by depth asc; - ''', (directory_id, depth, -1 if includeSelf else 0), fetch=FETCH_ALL) + ''', (dir_id, depth, -1 if includeSelf else 0), fetch=FETCH_ALL) - def parentDirectories(self, directory_id): + def parentDirectories(self, dir_id): return self.executeQuery(''' SELECT dir.* FROM lta.directory_closure dc join lta.directory dir on dir.id = dc.ancestor_id where dc.descendant_id = %s and depth > 0 order by depth desc; - ''', [directory_id], fetch=FETCH_ALL) + ''', [dir_id], fetch=FETCH_ALL) def _date_bounded(self, query, args, table_column, from_date=None, to_date=None): result_query = query @@ -224,21 
+254,21 @@ class LTAStorageDb(PostgresDatabaseConnection): return result_query, result_args - def filesInDirectory(self, directory_id, from_date=None, to_date=None): + def filesInDirectory(self, dir_id, from_date=None, to_date=None): query = '''SELECT * FROM lta.fileinfo - where directory_id = %s''' + where dir_id = %s''' - args = (directory_id,) + args = (dir_id,) query, args = self._date_bounded(query, args, 'fileinfo.creation_date', from_date, to_date) return self.executeQuery(query, args, fetch=FETCH_ALL) - def numFilesInDirectory(self, directory_id, from_date=None, to_date=None): + def numFilesInDirectory(self, dir_id, from_date=None, to_date=None): query = '''SELECT count(id) FROM lta.fileinfo - where directory_id = %s''' + where dir_id = %s''' - args = (directory_id,) + args = (dir_id,) query, args = self._date_bounded(query, args, 'fileinfo.creation_date', from_date, to_date) @@ -249,87 +279,60 @@ class LTAStorageDb(PostgresDatabaseConnection): return 0 - def filesInTree(self, base_directory_id, from_date=None, to_date=None): + def directoryTreeStats(self, dir_id): + query = '''SELECT * FROM metainfo.stats WHERE dir_id = %s''' + args = (dir_id,) + + return self.executeQuery(query, args, fetch=FETCH_ONE) + + def filesInTree(self, base_dir_id, from_date=None, to_date=None): query = '''SELECT dir.id as dir_id, dir.name as dir_name, dc.depth as dir_depth, fi.id as file_id, fi.name as file_name, fi.size as file_size, fi.creation_date as file_creation_date FROM lta.directory_closure dc JOIN lta.directory dir on dir.id = dc.descendant_id - JOIN lta.fileinfo fi on fi.directory_id = dc.descendant_id + JOIN lta.fileinfo fi on fi.dir_id = dc.descendant_id WHERE dc.ancestor_id = %s''' - args = (base_directory_id,) + args = (base_dir_id,) - query, args = self._date_bounded(query, args, 'fileinfo.creation_date', from_date, to_date) + query, args = self._date_bounded(query, args, 'fi.creation_date', from_date, to_date) return self.executeQuery(query, args, fetch=FETCH_ALL) - def numFilesInTree(self, base_directory_id, from_date=None, to_date=None): - query = ''' - SELECT sum(ds.num_files) FROM metainfo.directory_stats ds - join lta.directory_closure dc on dc.descendant_id = ds.directory_id - where ancestor_id = %s - ''' - - args = (base_directory_id,) - - query, args = self._date_bounded(query, args, 'ds.min_file_creation_date', from_date=from_date) - query, args = self._date_bounded(query, args, 'ds.max_file_creation_date', to_date=to_date) - - result = self.executeQuery(query, args, fetch=FETCH_ONE) + def totalFileSizeAndNumFilesInSite(self, site_id, from_date=None, to_date=None): + query = '''SELECT * FROM metainfo.get_site_stats(%s, %s, %s)''' + args = (site_id, from_date, to_date) - if result['sum']: - return result['sum'] + return self.executeQuery(query, args, fetch=FETCH_ONE) - return 0 + def totalFileSizeAndNumFilesInTree(self, base_dir_id, from_date=None, to_date=None): + query = '''SELECT * FROM metainfo.get_tree_stats(%s, %s, %s)''' + args = (base_dir_id, from_date, to_date) - def totalFileSizeInTree(self, base_directory_id, from_date=None, to_date=None): - query = ''' - SELECT sum(ds.total_file_size) FROM metainfo.directory_stats ds - join lta.directory_closure dc on dc.descendant_id = ds.directory_id - where ancestor_id = %s - ''' - args = (base_directory_id,) + return self.executeQuery(query, args, fetch=FETCH_ONE) - query, args = self._date_bounded(query, args, 'ds.min_file_creation_date', from_date=from_date) - query, args = self._date_bounded(query, args, 
'ds.max_file_creation_date', to_date=to_date) +    def totalFileSizeInTree(self, base_dir_id, from_date=None, to_date=None): +        return self.totalFileSizeAndNumFilesInTree(base_dir_id, from_date, to_date)['tree_total_file_size'] -        result = self.executeQuery(query, args, fetch=FETCH_ONE) - -        if result['sum']: -            return result['sum'] -        return 0 +    def numFilesInTree(self, base_dir_id, from_date=None, to_date=None): +        return self.totalFileSizeAndNumFilesInTree(base_dir_id, from_date, to_date)['tree_num_files'] def numFilesInSite(self, site_id, from_date=None, to_date=None): -        num_files = 0L - -        root_dirs = self.rootDirectoriesForSite(site_id) - -        for root_dir in root_dirs: -            num_files += long(self.numFilesInTree(root_dir['dir_id'], from_date, to_date)) - -        return num_files +        return self.totalFileSizeAndNumFilesInSite(site_id, from_date, to_date)['tree_num_files'] def totalFileSizeInSite(self, site_id, from_date=None, to_date=None): -        total_size = 0L - -        root_dirs = self.rootDirectoriesForSite(site_id) - -        for root_dir in root_dirs: -            total_size += long(self.totalFileSizeInTree(root_dir['dir_id'], from_date, to_date)) +        return self.totalFileSizeAndNumFilesInSite(site_id, from_date, to_date)['tree_total_file_size'] -        return total_size +    def datetimeRangeOfFilesInTree(self, base_dir_id = None): +        query = '''SELECT min(fileinfo.creation_date) as min_creation_date, +                   max(fileinfo.creation_date) as max_creation_date +                   FROM lta.fileinfo''' +        args = None -    def datetimeRangeOfFilesInTree(self, base_directory_id = None): -        query = ''' -            SELECT min(fileinfo.creation_date) as min_creation_date, -            max(fileinfo.creation_date) as max_creation_date -            FROM lta.fileinfo -            ''' -        args = [] - -        if base_directory_id: -            query += '''\njoin lta.directory_closure dc on dc.descendant_id = lta.fileinfo.directory_id +        if base_dir_id: +            query += '''\njoin lta.directory_closure dc on dc.descendant_id = lta.fileinfo.dir_id where ancestor_id = %s''' -            args.append(base_directory_id) +            args = [base_dir_id] result = self.executeQuery(query, args, fetch=FETCH_ONE) @@ -353,7 +356,7 @@ class LTAStorageDb(PostgresDatabaseConnection): def numDirectoriesNotVisitedSince(self, timestamp): result = self.executeQuery(''' -            SELECT count(directory_id) FROM scraper.last_directory_visit +            SELECT count(dir_id) FROM scraper.last_directory_visit WHERE visit_date < %s ''', [timestamp], fetch=FETCH_ONE) @@ -362,6 +365,12 @@ return 0 +    def siteQuotaUsages(self): +        return self.executeQuery('''SELECT * FROM metainfo.site_quota_usage;''', fetch=FETCH_ALL) + +    def siteQuotaRootDirStats(self): +        return self.executeQuery('''SELECT * FROM metainfo.site_quota_root_dir_stats;''', fetch=FETCH_ALL) + def visitStats(self, before_timestamp = None): if not before_timestamp: before_timestamp = datetime.datetime.utcnow() @@ -376,7 +385,7 @@ class LTAStorageDb(PostgresDatabaseConnection): visits = self.executeQuery(''' select * -            from scraper.site_scraper_last_directoy_visit +            from scraper.site_scraper_last_directory_visit where site_id = %s and last_visit < %s order by last_visit asc @@ -389,13 +398,12 @@ class LTAStorageDb(PostgresDatabaseConnection): return siteStats + +
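With the per-tree aggregation now delegated to metainfo.get_tree_stats/get_site_stats, a time-bounded usage query becomes a single call. A sketch of the intended use, assuming a populated database (the dir id 1 is illustrative):

from datetime import datetime
from lofar.common import dbcredentials
from lofar.lta.ltastorageoverview import store

dbcreds = dbcredentials.DBCredentials().get('LTASO')
with store.LTAStorageDb(dbcreds) as db:
    stats = db.totalFileSizeAndNumFilesInTree(1, datetime(2017, 1, 1), datetime(2017, 12, 31))
    print '%s files, %s bytes' % (stats['tree_num_files'], stats['tree_total_file_size'])

if __name__ == '__main__': logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.INFO) -    import pprint dbcreds = dbcredentials.DBCredentials().get('LTASO') -    db = LTAStorageDb(dbcreds, True) -    print db.rootDirectoriesForSite(1) -    print db.directory_id(1, 'rootDir_0') -    pprint.pprint(db.parentDirectories(7)) -    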
pprint.pprint(db.subDirectories(1)) + with LTAStorageDb(dbcreds, True) as db: + print db.rootDirectoriesForSite(1) + print db.dir_id(1, 'rootDir_0') diff --git a/LTA/ltastorageoverview/lib/webservice/templates/index.html b/LTA/ltastorageoverview/lib/webservice/templates/index.html index 174bac4088b..f57e0a93b91 100644 --- a/LTA/ltastorageoverview/lib/webservice/templates/index.html +++ b/LTA/ltastorageoverview/lib/webservice/templates/index.html @@ -5,10 +5,21 @@ <head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> <title>{{title}}</title> - + <meta http-equiv="refresh" content="300"> <script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/2.1.3/jquery.min.js"></script> <script type="text/javascript" src="http://code.highcharts.com/highcharts.js"></script> <script type="text/javascript" src="http://code.highcharts.com/modules/exporting.js"></script> + <style> + table, th, td { + border: 1px solid black; + border-collapse: collapse; + text-align: right; + font-size: 14px; + } + th, td { + padding: 8px; + } +</style> </head> <body> <script type="text/javascript"> @@ -21,10 +32,11 @@ $('#usage_piechart_container').highcharts({ chart: { - plotBackgroundColor: null, - plotBorderWidth: null, - plotShadow: false, - type: 'pie' + animation: false, + plotBackgroundColor: null, + plotBorderWidth: null, + plotShadow: false, + type: 'pie' }, title: { text: '<span style="font-size: 22px">{{storagesitetitle|safe}}</span>', @@ -38,6 +50,7 @@ }, plotOptions: { pie: { + animation: false, allowPointSelect: true, cursor: 'pointer', dataLabels: { @@ -56,12 +69,12 @@ colorByPoint: true, data: {{storagesitedata|safe}} }] - }); + }); $(function () { $('#usage_trend_container').highcharts({ - chart: { type: 'area'}, + chart: { type: 'area', animation: false}, title: { text: '<span style="font-size: 22px">LTA Storage Site Usage Trend</span>', }, @@ -96,6 +109,7 @@ }, plotOptions: { area: { + animation: false, stacking: 'normal', lineColor: '#666666', lineWidth: 1, @@ -112,9 +126,9 @@ $(function () { $('#usage_deltas_container').highcharts({ - chart: { type: 'column'}, + chart: { type: 'column', animation: false}, title: { - text: '<span style="font-size: 22px">LTA Storage Site Deltas Per Month</span>', + text: '<span style="font-size: 22px">LTA Storage Site Deltas</span>', }, xAxis: { type: 'datetime', @@ -145,6 +159,7 @@ }, plotOptions: { column: { + animation: false, stacking:'normal', pointPadding: 0.01, groupPadding: 0.0, @@ -154,13 +169,68 @@ series: {{deltas_per_month_series|safe}} }); }); + }); + $(function () { + $('#quotas_chart_container').highcharts({ + chart: { type: 'column', + animation: false }, + title: { + text: '<span style="font-size: 22px">LTA Storage Site Tape Quota</span>', + }, + xAxis: { + type: 'datetime', + labels: { style: { fontSize: '22px'} }, + }, + yAxis: { + title: { text: '<span style="font-size: 22px">PB</span>'}, + labels: { + style: { fontSize: '22px'}, + formatter: function () { return this.value / 1e15;}}, + }, + legend: { itemStyle: { fontSize: '14px' } }, + tooltip: { + formatter: function () { + var s = '<b>' + Highcharts.dateFormat('%Y/%m/%d', this.x) + '</b>'; + $.each(this.points, function () { + s += '<br/><b>' + this.series.name + '</b>: ' + + Highcharts.numberFormat(this.y / 1e15, 2, '.') + ' PB'; + }); + + return s; + }, + shared: true + }, + style: { + color: (Highcharts.theme && Highcharts.theme.contrastTextColor) || 'black', + fontFamily: '"Lucida Grande", "Lucida Sans Unicode", Verdana, Arial, Helvetica, sans-serif', 
// default font + fontSize: '22px' + }, + plotOptions: { + column: { + animation: false, + stacking:'normal', + pointPadding: 0.01, + groupPadding: 0.05, + pointPlacement: -0.45 + } + }, + series: {{quota_series|safe}} + }); }); + </script> <div id="usage_trend_container" style="min-width: 310px; min-height: 600px; width: 95%; height: 100%; margin: 24px; margin-left: auto; margin-right: auto; "></div> <div id="usage_deltas_container" style="min-width: 310px; min-height: 600px; width: 95%; height: 100%; margin: 24px; margin-left: auto; margin-right: auto; "></div> <div id="usage_piechart_container" style="min-width: 310px; min-height: 600px; width: 80%; height: 100%; margin: 24px; margin-left: auto; margin-right: auto; "></div> + <div style="overflow: hidden; "> + <div id="quotas_chart_container" style="float: left; min-width: 310px; min-height: 600px; width: 48%; height: 100%; margin: 24px; margin-left: auto; "></div> + <div style="float: left; min-width: 310px; min-height: 600px; width: 48%; height: 100%; margin: 24px; margin-right: auto; "> + <p>Latest quota and usages per site and tape quotable dirs</p> + {{site_tape_usages|safe}} + </div> + </div> Data last gathered at {{data_gathered_timestamp}}. </body> </html> diff --git a/LTA/ltastorageoverview/lib/webservice/webservice.py b/LTA/ltastorageoverview/lib/webservice/webservice.py index 96b267faf30..4b698294599 100755 --- a/LTA/ltastorageoverview/lib/webservice/webservice.py +++ b/LTA/ltastorageoverview/lib/webservice/webservice.py @@ -54,21 +54,36 @@ def index(): # TODO: serve html first, and let client request data via ajax usages = {} + colors = {'sara': {'used': '#90ed7d', 'free': '#c5f6bc'}, + 'juelich': {'used': '#494950', 'free': '#a1a1aa'}, + 'poznan': {'used': '#7cb5ec', 'free': '#bcdaf5'}} + sites = db.sites() + sitesDict = { s['name']:s for s in sites } + sites = [sitesDict[sitename] for sitename in ['poznan', 'juelich', 'sara'] if sitename in sitesDict] - total = 0.0 - numFiles = 0L + total_lta_size = 0.0 + total_lta_num_files = 0L for site in sites: - site_usage = float(db.totalFileSizeInSite(site['id'])) - usages[site['name']] = site_usage - total += site_usage - numFiles += db.numFilesInSite(site['id']) - - storagesitedata='[' + ', '.join(['''{name: "%s %s", y: %.1f}''' % (site['name'], humanreadablesize(usages[site['name']]), 100.0*usages[site['name']]/total) for site in sites]) + ']' + totals = db.totalFileSizeAndNumFilesInSite(site['id']) + total_lta_size += totals['tree_total_file_size'] + total_lta_num_files += totals['tree_num_files'] + usages[site['name']] = totals['tree_total_file_size'] + + if total_lta_size > 0: + storagesitedata='[' + ', '.join(['''{name: "%s %s", color:'%s', y: %.2f}''' % (site['name'], humanreadablesize(usages[site['name']]), + colors[site['name']]['used'], + 100.0*usages[site['name']]/total_lta_size) for site in sites]) + ']' + else: + storagesitedata ='[]' min_date, max_date = db.datetimeRangeOfFilesInTree() - min_date = datetime(2012, 1, 1) - month_ranges = monthRanges(min_date, max_date) + if min_date is None: + min_date = datetime(2012, 1, 1) + if max_date is None: + max_date = datetime.utcnow() + min_date = max(datetime(2012, 1, 1), min_date) + month_ranges = monthRanges(min_date, max_date, 3) # convert end-of-month timestamps to milliseconds since epoch epoch = datetime.utcfromtimestamp(0) @@ -77,30 +92,73 @@ def index(): usage_per_month_series='[' deltas_per_month_series='[' for site in sites: - cumulatives = [db.totalFileSizeInSite(site['id'], to_date=mr[1]) for mr in month_ranges] + 
deltas_per_month = [db.totalFileSizeInSite(site['id'], from_date=mr[0], to_date=mr[1]) for mr in month_ranges] + data = ', '.join(['[%s, %s]' % (x[0], str(x[1])) for x in zip(datestamps, deltas_per_month)]) + deltas_per_month_series += '''{name: '%s', color:'%s', data: [%s]},\n''' % (site['name'], colors[site['name']]['used'], data) - data = ', '.join(['[%s, %s]' % (x[0], str(x[1])) for x in zip(datestamps, cumulatives)]) - usage_per_month_series += '''{name: '%s', data: [%s]},\n''' % (site['name'], data) + cumulatives = [deltas_per_month[0]] + for delta in deltas_per_month[1:]: + cumulative = cumulatives[-1] + delta + cumulatives.append(cumulative) - deltas = [0] - for i in range(1, len(cumulatives)): - delta = cumulatives[i] - cumulatives[i-1] - deltas.append(delta) + data = ', '.join(['[%s, %s]' % (x[0], str(x[1])) for x in zip(datestamps, cumulatives)]) + usage_per_month_series += '''{name: '%s', color:'%s', data: [%s]},\n''' % (site['name'], colors[site['name']]['used'], data) - data = ', '.join(['[%s, %s]' % (x[0], str(x[1])) for x in zip(datestamps, deltas)]) - deltas_per_month_series += '''{name: '%s', data: [%s]},\n''' % (site['name'], data) usage_per_month_series+=']' deltas_per_month_series+=']' + quota_dir_stats = db.siteQuotaRootDirStats() + site_usages_per_site = {} + latest_usages_per_site = {} + for site_usage in db.siteQuotaUsages(): + site_name = site_usage['site_name'] + if site_name not in site_usages_per_site: + site_usages_per_site[site_name] = [] + site_usages_per_site[site_name].append(site_usage) + if site_name not in latest_usages_per_site: + latest_usages_per_site[site_name] = site_usage + if site_usage['valid_until_date'] > latest_usages_per_site[site_name]['valid_until_date']: + latest_usages_per_site[site_name] = site_usage + + + quota_series='[' + site_tape_usages_table = '<table>\n' + site_tape_usages_table += '<tr><th style="text-align: left;">site</th><th style="text-align: left;">directory</th><th>total #files</th><th>total file size</th><th>quota</th><th>free</th><th>expiration</th></tr>\n' + for site_name in ['sara','juelich', 'poznan']: + if site_name in latest_usages_per_site: + latest_usage = latest_usages_per_site[site_name] + site_tape_usages_table += '<tr style="font-weight: bold;"><td style="text-align: left;">%s</td><td style="text-align: left;">%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n' % ( + latest_usage['site_name'], '', latest_usage['num_files'], humanreadablesize(latest_usage['total_file_size']), humanreadablesize(latest_usage['quota']), humanreadablesize(latest_usage['space_left']), latest_usage['valid_until_date']) + + for qds in quota_dir_stats: + if qds['site_name'] == site_name: + site_tape_usages_table += '<tr><td style="text-align: left;">%s</td><td style="text-align: left;">%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n' % ( + '', qds['dir_name'], qds['tree_num_files'], humanreadablesize(qds['tree_total_file_size']), '', '', '') + + site_tape_usages_table += '</table>\n' + + for site_name in ['poznan','juelich', 'sara']: + if site_name in site_usages_per_site: + site_usages_for_site = site_usages_per_site[site_name] + site_usages_for_site = sorted(site_usages_for_site, key=lambda x: x['valid_until_date']) + data = ','.join('[%d, %s]' % ((su['valid_until_date'] - epoch).total_seconds()*1000, su['space_left']) for su in site_usages_for_site) + quota_series+='''{ name:'%s_free', stack:'%s', color:'%s', data:[%s] },''' % (site_name,site_name,colors[site_name]['free'],data) + data = 
','.join('[%d, %s]' % ((su['valid_until_date'] - epoch).total_seconds()*1000, su['total_file_size']) for su in site_usages_for_site) + quota_series+='''{ name:'%s_used', stack:'%s', color:'%s', data:[%s] },''' % (site_name,site_name,colors[site_name]['used'], data) + + quota_series+=']' + return render_template('index.html', title='LTA storage overview', storagesitetitle='LTA Storage Site Usage', - storagesitesubtitle='Total: %s #dataproducts: %s' % (humanreadablesize(total, 'B', 1000), humanreadablesize(numFiles, '', 1000)), + storagesitesubtitle='Total: %s #dataproducts: %s' % (humanreadablesize(total_lta_size, 'B', 1000), humanreadablesize(total_lta_num_files, '', 1000)), storagesitedata=storagesitedata, usage_per_month_series=usage_per_month_series, deltas_per_month_series=deltas_per_month_series, + quota_series=quota_series, + site_tape_usages=site_tape_usages_table, data_gathered_timestamp=db.mostRecentVisitDate().strftime('%Y/%m/%d %H:%M:%S')) @app.route('/rest/sites/') diff --git a/LTA/ltastorageoverview/test/db_performance_test.py b/LTA/ltastorageoverview/test/db_performance_test.py index 304bd868642..b603cec0881 100755 --- a/LTA/ltastorageoverview/test/db_performance_test.py +++ b/LTA/ltastorageoverview/test/db_performance_test.py @@ -18,8 +18,9 @@ # with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>. import logging -import time from datetime import datetime, timedelta +import os + from lofar.lta.ltastorageoverview import store from lofar.common.datetimeutils import totalSeconds @@ -30,7 +31,7 @@ def main(): from lofar.common import dbcredentials # Check the invocation arguments - parser = OptionParser("%prog [options]", description='runs the lta scraper and stores results in the speficied database.') + parser = OptionParser("%prog [options]", description='wipes the lta storageoverview database, and executes a performance test by inserting many files.') parser.add_option('-V', '--verbose', dest='verbose', action='store_true', help='verbose logging') parser.add_option_group(dbcredentials.options_group(parser)) parser.set_defaults(dbcredentials="LTASO") @@ -43,22 +44,44 @@ def main(): logger.info("Using dbcreds: %s" % dbcreds.stringWithHiddenPassword()) + base_date = datetime.utcnow() + db = store.LTAStorageDb(dbcreds, options.verbose) - db.insertSite('siteA', 'srm://srm.siteA.nl:8444') - rootdir_id = db.insertRootDirectory('siteA', '/pnfs/grid.siteA.nl/data/lofar/ops/projects/lc1_001') + if raw_input('Are you sure you want to wipe the database? 
<y>/<n>:').lower() not in ['yes', 'y']: +        return + +    db.executeQuery('TRUNCATE lta.site CASCADE;', store.FETCH_NONE) +    db.insertSite('sara', 'srm://srm.siteA.nl:8444') +    rootdir_id = db.insertRootDirectory('sara', '/pnfs/grid.siteA.nl/data/lofar/ops') +    projects_dir_id = db.insertSubDirectory(rootdir_id, '/pnfs/grid.siteA.nl/data/lofar/ops/projects') + +    # junk_dir_id = db.insertSubDirectory(rootdir_id, '/pnfs/grid.siteA.nl/data/lofar/ops/misc') +    # fileinfos = [('testfile%d' % (i,), i+100, base_date, junk_dir_id) for i in range(0, 43)] +    # file_ids = db.insertFileInfos(fileinfos) +    # total_num_files_inserted = len(file_ids) +    total_num_files_inserted = 0 -    for obsId in range(100000, 100100): -        obsName = 'L%s' % obsId +    with open('db_perf.csv', 'w') as file: +        for cycle_nr in range(1, 10): +            for project_nr in range(1, 10): +                # project_name = 'lc%d_%03d/%d' % (cycle_nr, project_nr, os.getpid()) +                project_name = 'lc%d_%03d' % (cycle_nr, project_nr) +                projectdir_id = db.insertSubDirectory(projects_dir_id, '/pnfs/grid.siteA.nl/data/lofar/ops/projects/%s' % (project_name,)) -        obsdir_id = db.insertSubDirectory(rootdir_id, '/pnfs/grid.siteA.nl/data/lofar/ops/projects/lc1_001/%s' % obsId) +                obs_base_id = cycle_nr*100000+project_nr*1000 +                for obs_nr, obsId in enumerate(range(obs_base_id, obs_base_id+20)): +                    obsName = 'L%s' % obsId -        now = datetime.utcnow() -        fileinfos = [('%s_SB%3d' % (obsName, sbNr), 1234, now, obsdir_id) for sbNr in range(0, 488)] -        now = datetime.utcnow() -        db.insertFileInfos(fileinfos) -        elapsed = totalSeconds(datetime.utcnow() - now) -        print '%s,%s' % (db.numFilesInTree(rootdir_id), elapsed) +                    obsdir_id = db.insertSubDirectory(projectdir_id, '/pnfs/grid.siteA.nl/data/lofar/ops/projects/%s/%s' % (project_name, obsName)) +                    fileinfos = [('%s_SB%3d' % (obsName, sbNr), 1000+sbNr+project_nr*cycle_nr, base_date + timedelta(days=10*cycle_nr+project_nr, minutes=obs_nr, seconds=sbNr), obsdir_id) for sbNr in range(0, 2)] +                    now = datetime.utcnow() +                    file_ids = db.insertFileInfos(fileinfos) +                    total_num_files_inserted += len(file_ids) +                    elapsed = totalSeconds(datetime.utcnow() - now) +                    line = '%s,%s' % (total_num_files_inserted, elapsed) +                    print line +                    file.write(line + '\n') if __name__ == "__main__": main() diff --git a/LTA/ltastorageoverview/test/test_store.py index c3529c7a3f0..9fe0fdd67b7 100755 --- a/LTA/ltastorageoverview/test/test_store.py +++ b/LTA/ltastorageoverview/test/test_store.py @@ -21,152 +21,331 @@ import unittest import logging -import datetime +from datetime import datetime, timedelta import time -import os -import os.path -from lofar.common import dbcredentials +import os, os.path +import psycopg2 +from pprint import * +import lofar.common.dbcredentials as dbc from lofar.lta.ltastorageoverview import store +from lofar.common.postgres import FETCH_NONE,FETCH_ONE,FETCH_ALL + +try: +    import testing.postgresql +except ImportError as e: +    print str(e) +    print 'Please install python package testing.postgresql: sudo pip install testing.postgresql' +    exit(3)    # special lofar test exit code: skipped test logger = logging.getLogger(__name__)
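The reworked test below no longer needs a pre-existing LTASO database: testing.postgresql boots a throw-away postgres server per test run. The pattern in isolation, as a minimal sketch:

import psycopg2
import testing.postgresql

with testing.postgresql.Postgresql() as pg:       # starts a temporary postgres server
    conn = psycopg2.connect(**pg.dsn())           # dsn() gives host/port/user/database kwargs
    cursor = conn.cursor()
    cursor.execute('SELECT 1')
    print cursor.fetchone()[0]                    # -> 1
    conn.close()
# the server and its data directory are removed when the with-block exits

class TestLTAStorageDb(unittest.TestCase): def setUp(self): -        dbcreds = dbcredentials.DBCredentials().get('LTASO') -        logger.info("Using dbcreds: %s" % dbcreds.stringWithHiddenPassword()) - -        self.db = store.LTAStorageDb(dbcreds, True) -        self.db.executeQuery('delete from lta.fileinfo;') -        self.db.executeQuery('delete from lta.directory;') -        self.db.executeQuery('delete from lta.directory_closure;') -        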
self.db.executeQuery('delete from lta.storage_site;') - self.db.commit() - - #def tearDown(self): - #if os.path.exists(self.db.db_filename): - #os.remove(self.db.db_filename) - - def testSites(self): - siteA_id = self.db.insertSite('siteA', 'srm://siteA.org') - siteB_id = self.db.insertSite('siteB', 'srm://siteB.org') - - sites = self.db.sites() - siteNames = [x['name'] for x in sites] - self.assertEquals(2, len(siteNames)) - self.assertTrue('siteA' in siteNames) - self.assertTrue('siteB' in siteNames) - - site = self.db.site(siteA_id) - self.assertEqual('siteA', site['name']) - - site = self.db.site(siteB_id) - self.assertEqual('siteB', site['name']) - - def testRootDirs(self): - siteA_id = self.db.insertSite('siteA', 'srm://siteA.org') - siteB_id = self.db.insertSite('siteB', 'srm://siteB.org') + logger.info('setting up test LTASO database server...') - dirA1_id = self.db.insertRootDirectory('siteA', 'rootDir1') - dirA2_id = self.db.insertRootDirectory('siteA', 'rootDir2') - dirA3_id = self.db.insertRootDirectory('siteA', 'path/to/rootDir3') + # create a test db + logger.info(' creating test postgres server') + self.test_psql = testing.postgresql.Postgresql() + dsn = self.test_psql.dsn() + logger.info(' created test postgres server, dsn=%s', dsn) - dirB1_id = self.db.insertRootDirectory('siteB', 'rootDir1') - dirB2_id = self.db.insertRootDirectory('siteB', 'path/to/otherRootDir') + self.dbcreds = dbc.DBCredentials().get('LTASO') - rootDirs = self.db.rootDirectories() - self.assertEquals(5, len(rootDirs)) + if not self.dbcreds.password: + logger.warn('could not find lofar dbcredentials for ltaso test, please create a dbcreds file at ~/.lofar/dbcredentials/ltaso.ini with the proper credentials.') + exit(3) # special lofar test exit code: skipped test - rootDirsDict = {rd['dir_id']:rd for rd in rootDirs} + with psycopg2.connect(**dsn) as conn: + cursor = conn.cursor() + #use same user/pass as stored in local dbcreds + query = "CREATE USER %s WITH SUPERUSER PASSWORD '%s'" % (self.dbcreds.user, self.dbcreds.password) + cursor.execute(query) - self.assertEqual('rootDir1', rootDirsDict[dirA1_id]['dir_name']) - self.assertEqual(siteA_id, rootDirsDict[dirA1_id]['site_id']) - self.assertEqual('siteA', rootDirsDict[dirA1_id]['site_name']) + create_script_path = os.path.normpath(os.path.join(os.environ['LOFARROOT'], 'share', 'ltaso', 'create_db_ltastorageoverview.sql')) + logger.info(' running ltaso create script create_script=%s', create_script_path) + with open(create_script_path, 'r') as script: + cursor.execute(script.read()) + logger.info(' completed ltaso create script') - self.assertEqual('rootDir2', rootDirsDict[dirA2_id]['dir_name']) - self.assertEqual(siteA_id, rootDirsDict[dirA2_id]['site_id']) - self.assertEqual('siteA', rootDirsDict[dirA2_id]['site_name']) + # copy the test postgres server settings into dbcreds + # we can use these dbcreds in each test method to connect to the testing ltaso database + self.dbcreds.host = dsn['host'] + self.dbcreds.database = dsn['database'] + self.dbcreds.port = dsn['port'] - self.assertEqual('path/to/rootDir3', rootDirsDict[dirA3_id]['dir_name']) - self.assertEqual(siteA_id, rootDirsDict[dirA3_id]['site_id']) - self.assertEqual('siteA', rootDirsDict[dirA3_id]['site_name']) + logger.info('finished setting up test LTASO database') - self.assertEqual('rootDir1', rootDirsDict[dirB1_id]['dir_name']) - self.assertEqual(siteB_id, rootDirsDict[dirB1_id]['site_id']) - self.assertEqual('siteB', rootDirsDict[dirB1_id]['site_name']) + def tearDown(self): + 
logger.info('removing test LTASO database server...') + self.test_psql.stop() + logger.info('removed test LTASO database server') - self.assertEqual('path/to/otherRootDir', rootDirsDict[dirB2_id]['dir_name']) - self.assertEqual(siteB_id, rootDirsDict[dirB2_id]['site_id']) - self.assertEqual('siteB', rootDirsDict[dirB2_id]['site_name']) - - def testDirectoryTrees(self): - siteA_id = self.db.insertSite('siteA', 'srm://siteA.org') - siteB_id = self.db.insertSite('siteB', 'srm://siteB.org') - - for i in range(2): - rootDir_id = self.db.insertRootDirectory('siteA', 'rootDir_%d' % i) + def testSites(self): + with store.LTAStorageDb(self.dbcreds, True) as db: + siteA_id = db.insertSite('siteA', 'srm://siteA.org') + siteB_id = db.insertSite('siteB', 'srm://siteB.org') - for j in range(2): - subDir_id = self.db.insertSubDirectory(rootDir_id, 'subDir_%d' % j) - self.db.insertFileInfo('file_%d' % j, 271*(j+1), datetime.datetime.utcnow(), subDir_id) + sites = db.sites() + siteNames = [x['name'] for x in sites] + self.assertEquals(2, len(siteNames)) + self.assertTrue('siteA' in siteNames) + self.assertTrue('siteB' in siteNames) - for k in range(2): - subsubDir_id = self.db.insertSubDirectory(subDir_id, 'subsubDir_%d' % k) - self.db.insertFileInfo('file_%d_%d' % (j,k), 314*(k+1), datetime.datetime.utcnow(), subsubDir_id) + site = db.site(siteA_id) + self.assertEqual('siteA', site['name']) - rootDirs = self.db.rootDirectories() - self.assertEquals(2, len(rootDirs)) + site = db.site(siteB_id) + self.assertEqual('siteB', site['name']) - for id in [x['dir_id'] for x in rootDirs]: - subDirs = self.db.subDirectories(id, 1, False) - for subDir in subDirs: - subDir_parent_id = subDir['parent_dir_id'] - self.assertEquals(id, subDir_parent_id) + def testRootDirs(self): + with store.LTAStorageDb(self.dbcreds, True) as db: + siteA_id = db.insertSite('siteA', 'srm://siteA.org') + siteB_id = db.insertSite('siteB', 'srm://siteB.org') + + dirA1_id = db.insertRootDirectory('siteA', 'rootDir1') + dirA2_id = db.insertRootDirectory('siteA', 'rootDir2') + dirA3_id = db.insertRootDirectory('siteA', 'path/to/rootDir3') + + dirB1_id = db.insertRootDirectory('siteB', 'rootDir1') + dirB2_id = db.insertRootDirectory('siteB', 'path/to/otherRootDir') + + rootDirs = db.rootDirectories() + self.assertEquals(5, len(rootDirs)) + + rootDirsDict = {rd['root_dir_id']:rd for rd in rootDirs} + + self.assertEqual('rootDir1', rootDirsDict[dirA1_id]['dir_name']) + self.assertEqual(siteA_id, rootDirsDict[dirA1_id]['site_id']) + self.assertEqual('siteA', rootDirsDict[dirA1_id]['site_name']) + + self.assertEqual('rootDir2', rootDirsDict[dirA2_id]['dir_name']) + self.assertEqual(siteA_id, rootDirsDict[dirA2_id]['site_id']) + self.assertEqual('siteA', rootDirsDict[dirA2_id]['site_name']) + + self.assertEqual('path/to/rootDir3', rootDirsDict[dirA3_id]['dir_name']) + self.assertEqual(siteA_id, rootDirsDict[dirA3_id]['site_id']) + self.assertEqual('siteA', rootDirsDict[dirA3_id]['site_name']) + + self.assertEqual('rootDir1', rootDirsDict[dirB1_id]['dir_name']) + self.assertEqual(siteB_id, rootDirsDict[dirB1_id]['site_id']) + self.assertEqual('siteB', rootDirsDict[dirB1_id]['site_name']) + + self.assertEqual('path/to/otherRootDir', rootDirsDict[dirB2_id]['dir_name']) + self.assertEqual(siteB_id, rootDirsDict[dirB2_id]['site_id']) + self.assertEqual('siteB', rootDirsDict[dirB2_id]['site_name']) + + def testDirectoryTreesAndStats(self): + with store.LTAStorageDb(self.dbcreds, True) as db: + base_time = datetime.utcnow() + base_time -= 
timedelta(seconds=base_time.second, microseconds=base_time.microsecond) + + for site_nr in range(2): + site_name = 'site%d' % site_nr + site_url = 'srm://%s.org' % site_name + site_id = db.insertSite(site_name, site_url) + + dir2subdir = {} + + for project_nr in range(3): + rootDir_id = db.insertRootDirectory(site_name, 'rootDir_%d' % project_nr) + dir2subdir[rootDir_id] = [] + + for subdir_nr in range(4): + subDir_id = db.insertSubDirectory(rootDir_id, 'subDir_%d' % subdir_nr) + dir2subdir[subDir_id] = [] + dir2subdir[rootDir_id].append(subDir_id) + for file_nr in range(project_nr*subdir_nr): + db.insertFileInfo('file_%d' % file_nr, 271*(file_nr+1), base_time + timedelta(days=10*site_nr+project_nr, hours=subdir_nr, seconds=file_nr), subDir_id) + + dir_files = db.filesInDirectory(subDir_id) + dir_stats = db.directoryTreeStats(subDir_id) + + self.assertEqual(sum(f['size'] for f in dir_files), dir_stats['dir_total_file_size']) + self.assertEqual(len(dir_files), dir_stats['dir_num_files']) + if dir_files: + self.assertEqual(min(f['size'] for f in dir_files), dir_stats['dir_min_file_size']) + self.assertEqual(max(f['size'] for f in dir_files), dir_stats['dir_max_file_size']) + self.assertEqual(min(f['creation_date'] for f in dir_files), dir_stats['dir_min_file_creation_date']) + self.assertEqual(max(f['creation_date'] for f in dir_files), dir_stats['dir_max_file_creation_date']) + + for subsubdir_nr in range(5): + subsubDir_id = db.insertSubDirectory(subDir_id, 'subsubDir_%d' % subsubdir_nr) + dir2subdir[subsubDir_id] = [] + dir2subdir[subDir_id].append(subsubDir_id) + for kk in range(project_nr*subdir_nr*subsubdir_nr): + db.insertFileInfo('file_%d_%d' % (subdir_nr,kk), 314*(kk+1), base_time + timedelta(days=10*site_nr+project_nr, hours=10*subdir_nr+subsubdir_nr+2, seconds=kk), subsubDir_id) + + dir_files = db.filesInDirectory(subsubDir_id) + dir_stats = db.directoryTreeStats(subsubDir_id) + + self.assertEqual(sum(f['size'] for f in dir_files), dir_stats['dir_total_file_size']) + self.assertEqual(len(dir_files), dir_stats['dir_num_files']) + if dir_files: + self.assertEqual(min(f['size'] for f in dir_files), dir_stats['dir_min_file_size']) + self.assertEqual(max(f['size'] for f in dir_files), dir_stats['dir_max_file_size']) + self.assertEqual(min(f['creation_date'] for f in dir_files), dir_stats['dir_min_file_creation_date']) + self.assertEqual(max(f['creation_date'] for f in dir_files), dir_stats['dir_max_file_creation_date']) + + tree_totals = db.totalFileSizeAndNumFilesInTree(subDir_id, dir_stats['dir_min_file_creation_date'], dir_stats['dir_max_file_creation_date']) + self.assertEqual(tree_totals['tree_num_files'], dir_stats['dir_num_files']) + self.assertEqual(tree_totals['tree_total_file_size'], dir_stats['dir_total_file_size']) + + # test 1st level subdir again, and also check inclusion of 2nd level subdirs in tree stats + dir_files = db.filesInDirectory(subDir_id) + dir_stats = db.directoryTreeStats(subDir_id) + # this dir only... 
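+                        # note: dir_* stats cover only files directly in this directory, while tree_* stats also include all files in its subdirectories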
+ self.assertEqual(sum(f['size'] for f in dir_files), dir_stats['dir_total_file_size']) + self.assertEqual(len(dir_files), dir_stats['dir_num_files']) + if dir_files: + self.assertEqual(min(f['size'] for f in dir_files), dir_stats['dir_min_file_size']) + self.assertEqual(max(f['size'] for f in dir_files), dir_stats['dir_max_file_size']) + self.assertEqual(min(f['creation_date'] for f in dir_files), dir_stats['dir_min_file_creation_date']) + self.assertEqual(max(f['creation_date'] for f in dir_files), dir_stats['dir_max_file_creation_date']) + + # including subdirs in tree... + self.assertEqual(sum(f['file_size'] for f in db.filesInTree(subDir_id)), dir_stats['tree_total_file_size']) + self.assertEqual(len(db.filesInTree(subDir_id)), dir_stats['tree_num_files']) + + rootDirs = db.rootDirectoriesForSite(site_id) + self.assertEquals(3, len(rootDirs)) + + for root_dir_id in [x['dir_id'] for x in rootDirs]: + subDirs = db.subDirectories(root_dir_id, 1, False) + for subDir in subDirs: + subDir_parent_id = subDir['parent_dir_id'] + self.assertEquals(root_dir_id, subDir_parent_id) + self.assertTrue(subDir['id'] in dir2subdir[root_dir_id]) + + subsubDirs = db.subDirectories(subDir['id'], 1, False) + for subsubDir in subsubDirs: + subsubDir_parent_id = subsubDir['parent_dir_id'] + self.assertEquals(subDir['id'], subsubDir_parent_id) + self.assertTrue(subsubDir['id'] in dir2subdir[subDir['id']]) + + # check various selects of files in the tree, for each file + tree_files = sorted(db.filesInTree(root_dir_id), key=lambda f: f['file_creation_date']) + for file in tree_files: + # check if filesInTree return this one file when time delimited for this specific file_creation_date + file_creation_date = file['file_creation_date'] + selected_tree_files = db.filesInTree(root_dir_id, file_creation_date, file_creation_date) + self.assertEqual(1, len(selected_tree_files)) + self.assertEqual(file['file_creation_date'], selected_tree_files[0]['file_creation_date']) + self.assertEqual(file['file_size'], selected_tree_files[0]['file_size']) + + # get the 'totals' for this root_dir, but select only this file by date. + # should return 1 file. 
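+                        # (every file in this generated tree gets a unique creation_date by construction, so an exact-date range should match exactly one file)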
+ tree_totals = db.totalFileSizeAndNumFilesInTree(root_dir_id, file_creation_date, file_creation_date) + self.assertEqual(1, tree_totals['tree_num_files']) + self.assertEqual(file['file_size'], tree_totals['tree_total_file_size']) + + # check some ranges files/times + for idx, file in enumerate(tree_files): + file_creation_date = file['file_creation_date'] + + #select any file >= file_creation_date + expected_selected_tree_files = tree_files[idx:] + selected_tree_files = db.filesInTree(root_dir_id, file_creation_date, None) + self.assertEqual(len(expected_selected_tree_files), len(selected_tree_files)) + selected_tree_files_ids = set([f['file_id'] for f in selected_tree_files]) + for expected_file in expected_selected_tree_files: + self.assertTrue(expected_file['file_id'] in selected_tree_files_ids) + + # and check the totals as well + tree_totals = db.totalFileSizeAndNumFilesInTree(root_dir_id, file_creation_date, None) + self.assertEqual(len(expected_selected_tree_files), tree_totals['tree_num_files']) + self.assertEqual(sum(f['file_size'] for f in expected_selected_tree_files), tree_totals['tree_total_file_size']) + + #select any file <= file_creation_date + expected_selected_tree_files = tree_files[:idx+1] + selected_tree_files = db.filesInTree(root_dir_id, None, file_creation_date) + self.assertEqual(len(expected_selected_tree_files), len(selected_tree_files)) + selected_tree_files_ids = set([f['file_id'] for f in selected_tree_files]) + for expected_file in expected_selected_tree_files: + self.assertTrue(expected_file['file_id'] in selected_tree_files_ids) + + # and check the totals as well + tree_totals = db.totalFileSizeAndNumFilesInTree(root_dir_id, None, file_creation_date) + self.assertEqual(len(expected_selected_tree_files), tree_totals['tree_num_files']) + self.assertEqual(sum(f['file_size'] for f in expected_selected_tree_files), tree_totals['tree_total_file_size']) - print '\n'.join([str(x) for x in self.db.filesInTree(rootDir_id)]) def testLeastRecentlyVisitedDirectory(self): - siteA_id = self.db.insertSite('siteA', 'srm://siteA.org') + with store.LTAStorageDb(self.dbcreds, True) as db: + db.insertSite('siteA', 'srm://siteA.org') - dir_ids = [] - for i in range(3): - dir_id = self.db.insertRootDirectory('siteA', 'rootDir_%d' % i) - dir_ids.append(dir_id) + dir_ids = [] + for i in range(3): + dir_id = db.insertRootDirectory('siteA', 'rootDir_%d' % i) + dir_ids.append(dir_id) - self.db.updateDirectoryLastVisitTime(dir_id, datetime.datetime.utcnow()) - time.sleep(0.002) + db.updateDirectoryLastVisitTime(dir_id, datetime.utcnow()) + time.sleep(0.002) - visitStats = self.db.visitStats() - self.assertTrue('siteA' in visitStats) - self.assertTrue('least_recent_visited_dir_id' in visitStats['siteA']) + visitStats = db.visitStats() + self.assertTrue('siteA' in visitStats) + self.assertTrue('least_recent_visited_dir_id' in visitStats['siteA']) - lvr_dir_id = visitStats['siteA']['least_recent_visited_dir_id'] - self.assertEquals(dir_ids[0], lvr_dir_id) + lvr_dir_id = visitStats['siteA']['least_recent_visited_dir_id'] + self.assertEquals(dir_ids[0], lvr_dir_id) - self.db.updateDirectoryLastVisitTime(dir_ids[0], datetime.datetime.utcnow()) - self.db.updateDirectoryLastVisitTime(dir_ids[1], datetime.datetime.utcnow()) + db.updateDirectoryLastVisitTime(dir_ids[0], datetime.utcnow()) + db.updateDirectoryLastVisitTime(dir_ids[1], datetime.utcnow()) - visitStats = self.db.visitStats() - lvr_dir_id = visitStats['siteA']['least_recent_visited_dir_id'] - self.assertEquals(dir_ids[2], 
lvr_dir_id)
+            visitStats = db.visitStats()
+            lvr_dir_id = visitStats['siteA']['least_recent_visited_dir_id']
+            self.assertEquals(dir_ids[2], lvr_dir_id)
 
     def testDuplicateSubDirs(self):
-        siteA_id = self.db.insertSite('siteA', 'srm://siteA.org')
-        siteB_id = self.db.insertSite('siteB', 'srm://siteB.org')
-
-        dirA_id = self.db.insertRootDirectory('siteA', 'rootDir1')
-        dirB_id = self.db.insertRootDirectory('siteB', 'rootDir1')
-
-        subDirA1_id = self.db.insertSubDirectory(dirA_id, 'foo')
-        subDirA2_id = self.db.insertSubDirectory(dirA_id, 'bar')
-        subDirB1_id = self.db.insertSubDirectory(dirB_id, 'foo')
-
-        self.assertNotEquals(None, subDirA1_id)
-        self.assertNotEquals(None, subDirA2_id)
-        self.assertNotEquals(None, subDirB1_id)
-
-        subDirA1a_id = self.db.insertSubDirectory(dirA_id, 'foo')
-        self.assertEquals(None, subDirA1a_id)
-
+        with store.LTAStorageDb(self.dbcreds, True) as db:
+            db.insertSite('siteA', 'srm://siteA.org')
+            db.insertSite('siteB', 'srm://siteB.org')
+
+            dirA_id = db.insertRootDirectory('siteA', 'rootDir1')
+            dirB_id = db.insertRootDirectory('siteB', 'rootDir1')
+
+            subDirA1_id = db.insertSubDirectory(dirA_id, 'foo')
+            subDirA2_id = db.insertSubDirectory(dirA_id, 'bar')
+            subDirB1_id = db.insertSubDirectory(dirB_id, 'foo')
+
+            self.assertNotEquals(None, subDirA1_id)
+            self.assertNotEquals(None, subDirA2_id)
+            self.assertNotEquals(None, subDirB1_id)
+
+            subDirA1a_id = db.insertSubDirectory(dirA_id, 'foo')
+            self.assertEquals(None, subDirA1a_id)
+
+    def testProjectsAndObservations(self):
+        with store.LTAStorageDb(self.dbcreds, True) as db:
+            #first insert a lot of data...
+            db.insertSite('juelich', 'srm://lofar-srm.fz-juelich.de:8443')
+            db.insertSite('sara', 'srm://srm.grid.sara.nl:8443')
+
+            juelich_root_dir_id = db.insertRootDirectory('juelich', '/pnfs/fz-juelich.de/data/lofar/ops/')
+            sara_root_dir_id = db.insertRootDirectory('sara', '/pnfs/grid.sara.nl/data/lofar/ops')
+
+            juelich_projects_dir_id = db.insertSubDirectory(juelich_root_dir_id, '/pnfs/fz-juelich.de/data/lofar/ops/projects')
+            sara_projects_dir_id = db.insertSubDirectory(sara_root_dir_id, '/pnfs/grid.sara.nl/data/lofar/ops/projects')
+
+            for project_nr, project_name in enumerate(['lc8_001', '2017lofarobs', 'ddt5_001']):
+                # projects are sometimes stored at multiple sites
+                for projects_dir_id, projects_dir_path in [(juelich_projects_dir_id, '/pnfs/fz-juelich.de/data/lofar/ops/projects'), (sara_projects_dir_id, '/pnfs/grid.sara.nl/data/lofar/ops/projects')]:
+                    project_dir_id = db.insertSubDirectory(projects_dir_id,
+                                                           projects_dir_path + '/' + project_name)
+                    for obs_nr in range(3):
+                        obs_name = 'L%06d' % ((project_nr+1)*1000 + obs_nr)
+                        obs_dir_id = db.insertSubDirectory(project_dir_id,
+                                                           projects_dir_path + '/' + project_name + '/' + obs_name)
+
+                        for sb_nr in range(244):
+                            file_name = '%s_SB%03d.MS.tar' % (obs_name, sb_nr)
+                            db.insertFileInfo(file_name, 1, datetime.utcnow(), obs_dir_id, False)
+            db.commit()
+
+            # then check the results
+            # TODO: check the results with real assertions instead of just logging them
+            logger.info(pformat(db.executeQuery('select * from metainfo.project_directory', fetch=FETCH_ALL)))
+            logger.info(pformat(db.executeQuery('select * from metainfo.project_stats', fetch=FETCH_ALL)))
+            logger.info(pformat(db.executeQuery('select * from metainfo.project_observation_dataproduct', fetch=FETCH_ALL)))
 
 # run tests if main
 if __name__ == '__main__':
-- 
GitLab