From af74a23169ae651ccba20e565876752244a2cc69 Mon Sep 17 00:00:00 2001
From: <>
Date: Fri, 13 Sep 2024 16:33:03 +0000
Subject: [PATCH] Deployed 8419a58a with MkDocs version: 1.1.2
---
index.html | 1 +
search/search_index.json | 2 +-
sitemap.xml | 60 +++++++++++++++++++--------------------
sitemap.xml.gz | Bin 510 -> 510 bytes
4 files changed, 32 insertions(+), 31 deletions(-)
diff --git a/index.html b/index.html
index c927e02e..76e27baa 100644
--- a/index.html
+++ b/index.html
@@ -844,6 +844,7 @@
Weekly Operations Meetings
Meeting ID: 183 382 852 (password required; available on request)
Meeting Minutes
+- September 13, 2024
- September 6, 2024
- August 23, 2024
- August 16, 2024
diff --git a/search/search_index.json b/search/search_index.json
index 94f83065..fe361d8b 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"OSG Operations Welcome to the home page of the OSG Operations Team documentation area! Mission The mission of OSG Operations is to maintain and improve distributed high throughput computing services to support research communities. This is accomplished by: Operating and maintaining our services in a user-oriented, robust, and reliable manner. Developing a professional and skilled staff dedicated to a service philosophy. Managing resources responsibly, efficiently, and with accountability. Evaluating and continually improving the actions, methods and processes that allow the OSG to operate. Contact Us Open a Ticket Slack channel - if you can't create an account, send an e-mail to help@opensciencegrid.org Email: help@opensciencegrid.org Registration (Contact, Resource, VO, or Project) Register with OSG Weekly Operations Meetings When: Fridays 12:30 pm Central URL: https://unl.zoom.us/j/183382852 Phone: +1 669 900 6833 or +1 408 638 0968 or +1 646 876 9923 Meeting ID: 183 382 852 (password required; available on request) Meeting Minutes September 6, 2024 August 23, 2024 August 16, 2024 August 9, 2024 August 2, 2024 July 26, 2024 July 19, 2024 July 5, 2024 June 28, 2024 June 21, 2024 June 14, 2024 June 7, 2024 May 31, 2024 May 24, 2024 May 17, 2024 May 10, 2024 May 3, 2024 April 26, 2024 April 19, 2024 April 12, 2024 April 5, 2024 March 29, 2024 (canceled) March 22, 2024 March 15, 2024 March 8, 2024 March 1, 2024 February 23, 2024 February 16, 2024 February 9, 2024 February 2, 2024 January 26, 2024 January 19, 2024 January 12, 2024 January 5, 2024 December 29, 2023 (canceled) December 22, 2023 (canceled) December 15, 2023 December 8, 2023 December 1, 2023 November 24, 2023 (canceled) November 17, 2023 November 10, 2023 November 3, 2023 October 27, 2023 October 20, 2023 October 13, 2023 October 6, 2023 September 29, 2023 September 22, 2023 September 15, 2023 September 8, 2023 September 1, 2023 August 25, 2023 August 18, 2023 August 11, 2023 August 4, 2023 July 28, 2023 July 21, 2023 January 14, 2023 (canceled due to Throughput Computing 23) July 7, 2023 June 30, 2023 June 23, 2023 June 16, 2023 June 9, 2023 June 2, 2023 May 26, 2023 May 19, 2023 May 12, 2023 May 5, 2023 April 28, 2023 April 21, 2023 April 14, 2023 April 7, 2023 March 31, 2023 March 24, 2023 March 17, 2023 March 10, 2023 March 3, 2023 February 24, 2023 February 17, 2023 February 10, 2023 February 3, 2023 January 27, 2023 January 20, 2023 January 13, 2023 January 6, 2023 (canceled) December 30, 2022 (canceled) December 23, 2022 (canceled) December 16, 2022 December 9, 2022 December 2, 2022 November 25, 2022 (canceled) November 18, 2022 November 11, 2022 November 4, 2022 October 28, 2022 October 21, 2022 October 14, 2022 October 7, 2022 September 30, 2022 September 23, 2022 (canceled) September 16, 2022 (canceled) September 9, 2022 September 2, 2022 August 26, 2022 August 19, 2022 August 12, 2022 August 5, 2022 (canceled) July 29, 2022 July 22, 2022 (canceled) July 15, 2022 July 8, 2022 July 1, 2022 June 24, 2022 June 17, 2022 June 10, 2022 June 3, 2022 May 27, 2022 May 20, 2022 May 13, 2022 May 6, 2022 (canceled) April 29, 2022 April 22, 2022 April 15, 2022 April 8, 2022 April 1, 2022 March 25, 2022 March 18, 2022 (canceled) March 11, 2022 March 4, 2022 February 25, 2022 February 18, 2022 February 11, 2022 February 4, 2022 January 28, 2022 January 21, 2022 January 14, 2022 January 7, 2022 December 31, 
2021 (canceled) December 24, 2021 (canceled) December 17, 2021 December 10, 2021 December 3, 2021 November 26, 2021 (canceled) November 19, 2021 November 12, 2021 November 5, 2021 October 29, 2021 October 22, 2021 October 15, 2021 (canceled) October 8, 2021 October 1, 2021 September 24, 2021 September 17, 2021 September 10, 2021 September 3, 2021 August 27, 2021 August 20, 2021 August 13, 2021 August 6, 2021 July 30, 2021 July 23, 2021 July 16, 2021 July 9, 2021 July 2, 2021 June 25, 2021 June 18, 2021 June 11, 2021 June 4, 2021 May 28, 2021 May 21, 2021 May 14, 2021 May 7, 2021 April 30, 2021 April 23, 2021 April 16, 2021 April 9, 2021 April 2, 2021 March 26, 2021 March 19, 2021 March 12, 2021 March 5, 2021 (canceled) February 26, 2021 February 19, 2021 February 12, 2021 February 5, 2021 January 29, 2021 January 22, 2021 January 15, 2021 January 8, 2021 January 1, 2021 (canceled) December 25, 2020 (canceled) December 18, 2020 December 11, 2020 December 4, 2020 November 20, 2020 November 13, 2020 November 6, 2020 October 30, 2020 October 23, 2020 October 16, 2020 October 9, 2020 October 2, 2020 September 25, 2020 September 18, 2020 September 11, 2020 September 4, 2020 (canceled) August 28, 2020 August 21, 2020 August 14, 2020 August 7, 2020 July 31, 2020 July 24, 2020 July 17, 2020 July 10, 2020 July 3, 2020 (canceled) June 26, 2020 June 19, 2020 June 12, 2020 June 5, 2020 May 29, 2020 (canceled) May 22, 2020 May 15, 2020 May 8, 2020 May 1, 2020 April 24, 2020 April 17, 2020 April 10, 2020 April 3, 2020 March 27, 2020 March 20, 2020 March 13, 2020 March 6, 2020 February 28, 2020 February 21, 2020 February 14, 2020 February 7, 2020 January 31, 2020 January 24, 2020 January 17, 2020 January 10, 2020 January 3, 2020 December 27, 2019 December 20, 2019 December 13, 2019 December 6, 2019 November 29, 2019 (canceled) November 22, 2019 November 15, 2019 November 8, 2019 November 1, 2019 October 25, 2019 October 18, 2019 October 11, 2019 October 4, 2019 September 27, 2019 September 20, 2019 September 13, 2019 September 6, 2019 August 30, 2019 August 23, 2019 August 16, 2019 August 9, 2019 August 2, 2019 July 26, 2019 July 19, 2019 July 12, 2019 July 8, 2019 July 1, 2019 June 24, 2019 June 17, 2019 June 10, 2019 June 3, 2019 May 28, 2019 May 20, 2019 May 13, 2019 May 6, 2019 April 29, 2019 April 22, 2019 April 15, 2019 April 8, 2019 April 1, 2019 March 25, 2019 March 18, 2019 (canceled due to HOW 2019) March 11, 2019 March 4, 2019 February 25, 2019 February 19, 2019 February 11, 2019 February 4, 2019 January 28, 2019 (canceled due to F2F meeting) January 22, 2019 January 14, 2019 January 7, 2019 December 31, 2018 (canceled) December 24, 2018 (canceled) December 17, 2018 December 10, 2018 December 3, 2018 November 26, 2018 November 19, 2018 November 13, 2018 November 5, 2018 (canceled) October 29, 2018 (canceled) October 22, 2018 (canceled) October 15, 2018 October 8, 2018 October 1, 2018 September 24, 2018 September 17, 2018 September 10, 2018 September 4, 2018 August 27, 2018 August 20, 2018 August 13, 2018 August 6, 2018 Archived Meeting Minutes For archived meeting minutes, see the GitHub repository","title":"Home"},{"location":"#osg-operations","text":"Welcome to the home page of the OSG Operations Team documentation area!","title":"OSG Operations"},{"location":"#mission","text":"The mission of OSG Operations is to maintain and improve distributed high throughput computing services to support research communities. 
This is accomplished by: Operating and maintaining our services in a user-oriented, robust, and reliable manner. Developing a professional and skilled staff dedicated to a service philosophy. Managing resources responsibly, efficiently, and with accountability. Evaluating and continually improving the actions, methods and processes that allow the OSG to operate.","title":"Mission"},{"location":"#contact-us","text":"Open a Ticket Slack channel - if you can't create an account, send an e-mail to help@opensciencegrid.org Email: help@opensciencegrid.org","title":"Contact Us"},{"location":"#registration-contact-resource-vo-or-project","text":"Register with OSG","title":"Registration (Contact, Resource, VO, or Project)"},{"location":"#weekly-operations-meetings","text":"When: Fridays 12:30 pm Central URL: https://unl.zoom.us/j/183382852 Phone: +1 669 900 6833 or +1 408 638 0968 or +1 646 876 9923 Meeting ID: 183 382 852 (password required; available on request)","title":"Weekly Operations Meetings"},{"location":"#meeting-minutes","text":"September 6, 2024 August 23, 2024 August 16, 2024 August 9, 2024 August 2, 2024 July 26, 2024 July 19, 2024 July 5, 2024 June 28, 2024 June 21, 2024 June 14, 2024 June 7, 2024 May 31, 2024 May 24, 2024 May 17, 2024 May 10, 2024 May 3, 2024 April 26, 2024 April 19, 2024 April 12, 2024 April 5, 2024 March 29, 2024 (canceled) March 22, 2024 March 15, 2024 March 8, 2024 March 1, 2024 February 23, 2024 February 16, 2024 February 9, 2024 February 2, 2024 January 26, 2024 January 19, 2024 January 12, 2024 January 5, 2024 December 29, 2023 (canceled) December 22, 2023 (canceled) December 15, 2023 December 8, 2023 December 1, 2023 November 24, 2023 (canceled) November 17, 2023 November 10, 2023 November 3, 2023 October 27, 2023 October 20, 2023 October 13, 2023 October 6, 2023 September 29, 2023 September 22, 2023 September 15, 2023 September 8, 2023 September 1, 2023 August 25, 2023 August 18, 2023 August 11, 2023 August 4, 2023 July 28, 2023 July 21, 2023 January 14, 2023 (canceled due to Throughput Computing 23) July 7, 2023 June 30, 2023 June 23, 2023 June 16, 2023 June 9, 2023 June 2, 2023 May 26, 2023 May 19, 2023 May 12, 2023 May 5, 2023 April 28, 2023 April 21, 2023 April 14, 2023 April 7, 2023 March 31, 2023 March 24, 2023 March 17, 2023 March 10, 2023 March 3, 2023 February 24, 2023 February 17, 2023 February 10, 2023 February 3, 2023 January 27, 2023 January 20, 2023 January 13, 2023 January 6, 2023 (canceled) December 30, 2022 (canceled) December 23, 2022 (canceled) December 16, 2022 December 9, 2022 December 2, 2022 November 25, 2022 (canceled) November 18, 2022 November 11, 2022 November 4, 2022 October 28, 2022 October 21, 2022 October 14, 2022 October 7, 2022 September 30, 2022 September 23, 2022 (canceled) September 16, 2022 (canceled) September 9, 2022 September 2, 2022 August 26, 2022 August 19, 2022 August 12, 2022 August 5, 2022 (canceled) July 29, 2022 July 22, 2022 (canceled) July 15, 2022 July 8, 2022 July 1, 2022 June 24, 2022 June 17, 2022 June 10, 2022 June 3, 2022 May 27, 2022 May 20, 2022 May 13, 2022 May 6, 2022 (canceled) April 29, 2022 April 22, 2022 April 15, 2022 April 8, 2022 April 1, 2022 March 25, 2022 March 18, 2022 (canceled) March 11, 2022 March 4, 2022 February 25, 2022 February 18, 2022 February 11, 2022 February 4, 2022 January 28, 2022 January 21, 2022 January 14, 2022 January 7, 2022 December 31, 2021 (canceled) December 24, 2021 (canceled) December 17, 2021 December 10, 2021 December 3, 2021 November 26, 2021 (canceled) 
November 19, 2021 November 12, 2021 November 5, 2021 October 29, 2021 October 22, 2021 October 15, 2021 (canceled) October 8, 2021 October 1, 2021 September 24, 2021 September 17, 2021 September 10, 2021 September 3, 2021 August 27, 2021 August 20, 2021 August 13, 2021 August 6, 2021 July 30, 2021 July 23, 2021 July 16, 2021 July 9, 2021 July 2, 2021 June 25, 2021 June 18, 2021 June 11, 2021 June 4, 2021 May 28, 2021 May 21, 2021 May 14, 2021 May 7, 2021 April 30, 2021 April 23, 2021 April 16, 2021 April 9, 2021 April 2, 2021 March 26, 2021 March 19, 2021 March 12, 2021 March 5, 2021 (canceled) February 26, 2021 February 19, 2021 February 12, 2021 February 5, 2021 January 29, 2021 January 22, 2021 January 15, 2021 January 8, 2021 January 1, 2021 (canceled) December 25, 2020 (canceled) December 18, 2020 December 11, 2020 December 4, 2020 November 20, 2020 November 13, 2020 November 6, 2020 October 30, 2020 October 23, 2020 October 16, 2020 October 9, 2020 October 2, 2020 September 25, 2020 September 18, 2020 September 11, 2020 September 4, 2020 (canceled) August 28, 2020 August 21, 2020 August 14, 2020 August 7, 2020 July 31, 2020 July 24, 2020 July 17, 2020 July 10, 2020 July 3, 2020 (canceled) June 26, 2020 June 19, 2020 June 12, 2020 June 5, 2020 May 29, 2020 (canceled) May 22, 2020 May 15, 2020 May 8, 2020 May 1, 2020 April 24, 2020 April 17, 2020 April 10, 2020 April 3, 2020 March 27, 2020 March 20, 2020 March 13, 2020 March 6, 2020 February 28, 2020 February 21, 2020 February 14, 2020 February 7, 2020 January 31, 2020 January 24, 2020 January 17, 2020 January 10, 2020 January 3, 2020 December 27, 2019 December 20, 2019 December 13, 2019 December 6, 2019 November 29, 2019 (canceled) November 22, 2019 November 15, 2019 November 8, 2019 November 1, 2019 October 25, 2019 October 18, 2019 October 11, 2019 October 4, 2019 September 27, 2019 September 20, 2019 September 13, 2019 September 6, 2019 August 30, 2019 August 23, 2019 August 16, 2019 August 9, 2019 August 2, 2019 July 26, 2019 July 19, 2019 July 12, 2019 July 8, 2019 July 1, 2019 June 24, 2019 June 17, 2019 June 10, 2019 June 3, 2019 May 28, 2019 May 20, 2019 May 13, 2019 May 6, 2019 April 29, 2019 April 22, 2019 April 15, 2019 April 8, 2019 April 1, 2019 March 25, 2019 March 18, 2019 (canceled due to HOW 2019) March 11, 2019 March 4, 2019 February 25, 2019 February 19, 2019 February 11, 2019 February 4, 2019 January 28, 2019 (canceled due to F2F meeting) January 22, 2019 January 14, 2019 January 7, 2019 December 31, 2018 (canceled) December 24, 2018 (canceled) December 17, 2018 December 10, 2018 December 3, 2018 November 26, 2018 November 19, 2018 November 13, 2018 November 5, 2018 (canceled) October 29, 2018 (canceled) October 22, 2018 (canceled) October 15, 2018 October 8, 2018 October 1, 2018 September 24, 2018 September 17, 2018 September 10, 2018 September 4, 2018 August 27, 2018 August 20, 2018 August 13, 2018 August 6, 2018","title":"Meeting Minutes"},{"location":"#archived-meeting-minutes","text":"For archived meeting minutes, see the GitHub repository","title":"Archived Meeting Minutes"},{"location":"external-oasis-repos/","text":"External OASIS Repositories We offer hosting of non-OSG CVMFS repositories on OASIS. This means that requests to create, rename, remove, or blanking OASIS repositories will come in as GOC tickets. This document contains instructions for handling those tickets. 
Also see Policy for OSG Mirroring of External CVMFS repositories External OASIS repository Requests to Host a Repository on OASIS Ensure that the repository administrator is valid for the VO. This can be done by (a) OSG already having a relationship with the person or (b) the contacting the VO manager to find out. Also, the person should be listed in the OSG topology contacts list . Review provided URL and verify that it is appropriate for the VO and no other project uses it already. In order to make sure the name in URL is appropriate, check that the name is derived from the VO name or one of its projects. Then, add the repository URL to the topology for given VO under the OASISRepoURLs . This should cause the repository's configuration to be added to the OSG Stratum-0 within 15 minutes after URL is added into the topology. For example, if new URL is for the VO DUNE http://hcc-cvmfs-repo.unl.edu:8000/cvmfs/dune.osgstorage.org edit the following under the OASIS section and create PR: git clone git://github.com/opensciencegrid/topology.git vim topology/virtual-organizations/DUNE.yaml ... OASIS: OASISRepoURLs: - http://hcc-cvmfs-repo.unl.edu:8000/cvmfs/dune.osgstorage.org/ ... When the PR is approved, check on the oasis.opensciencegrid.org host whether the new repository was successfuly signed. There should be message about it in the log file /var/log/oasis/generate_whitelists.log : Tue Sep 25 17:34:02 2018 Running add_osg_repository http://hcc-cvmfs-repo.unl.edu:8000/cvmfs/dune.osgstorage.org dune.osgstorage.org: Signing 7 day whitelist with masterkeycard... done If the respository ends in a new domain name that has not been distributed before, a new domain key will be needed on oasis-replica which should get automatically downloaded from the etc/cvmfs/keys directory in the master branch of the config-repo github repository . There should be a message about downloading it in the log file /var/log/cvmfs/generate_replicas.log . After the key is downloaded the repository should also be automatically added, with messages in the same log file. After the repository is successfully on oasis-replica, in addition you need to update the OSG configuration repository. Make changes in a workspace cloned from the config-repo github repository and use the osg branch (or a branch made from it) in a personal account on oasis-itb . Add a domain configuration in etc/cvmfs/domain.d that's a lot like one of the other imported domains, for example egi.eu.conf . The server urls might be slightly different; use the URLs of the stratum 1s where it is already hosted if there are any, and you can add at least the FNAL and BNL stratum 1s. Copy key(s) for the domain into etc/cvmfs/keys from the master branch, either a single .pub file or a directory, whichever the master branch has. Test all these changes out on the config-osg.opensciencegrid.org repository on oasis-itb using the copy_config_osg command, and configure a test client to read from oasis-itb.opensciencegrid.org instead of oasis.opensciencegrid.org . Then commit those changes into a new branch you made from the osg branch, and make a pull request. Once that PR is approved and merged, log in to the oasis machine and run copy_config_osg as root there to copy from github to the production configuration repository on the oasis machine. If the repository name does not match *.opensciencegrid.org or *.osgstorage.org , skip this step and go on to your next step. 
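As a minimal sketch of the registration and verification steps described above (the VO, file name, and repository URL reuse the DUNE example from this page; adapt them to the request at hand):

```shell
# Register the repository URL in topology (example VO: DUNE)
git clone git://github.com/opensciencegrid/topology.git
cd topology
vim virtual-organizations/DUNE.yaml   # add the URL under OASIS -> OASISRepoURLs
# open a pull request with this change

# ~15 minutes after the PR is merged, confirm signing on oasis.opensciencegrid.org
grep dune.osgstorage.org /var/log/oasis/generate_whitelists.log
```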
If it does match one of those two patterns, then respond to the ticket to tell the administrator to continue with their next step (their step 4). We don't want them to continue before 15 minutes has elapsed after step 2 above, so either wait that much time or tell them the time they may proceed (15 minutes after you updated topology). Then wait until the admin has updated the ticket to indicate that they have completed their step before moving on. Ask the administrator of the BNL stratum 1 (John De Stefano) to also add the new repository. The BNL Stratum-1 administrator should set the service to read from http://oasis-replica.opensciencegrid.org:8002/cvmfs/ . When the BNL Stratum-1 administrator has reported back that the replication is ready, respond to the requester that the repository is fully replicated on the OSG and close the ticket. Requests to Change the URL of an External Repository If there is a request to change the URL of an external repository, update the registered value in OASISRepoURLs for the respective VO in the topology. Tell the requester that it is ready 15 minutes after topology is updated. Requests to Remove an External Repository After validating that the ticket submitter is authorized by the VO's OASIS manager, delete the registered value for in topology for the VO in OASIS Repo URLs. Verify that it is removed by running the following on any oasis machine to make sure it is missing from the list: print_osg_repos|grep Check if the repository has been replicated to RAL by looking in their repositories.json . The user documentation requests the user to make a GGUS ticket to do this, so either ask them to do it or do it yourself. Add the BNL Stratum-1 operator (John De Stefano) to the ticket and ask him to remove the repository. Wait for him to finish before proceeding. Add the FNAL Stratum-1 operators (Merina Albert, Hyun Woo Kim) to the ticket and ask them when they can be ready to delete the repository. They can't remove it before it is removed from oasis-replica because their Stratum-1 automatically adds all repositories oasis-replica has. However, it has to be done within 8 hours of removal on oasis-replica or an alarm will start going off. Run the following command on oasis , oasis-itb , oasis-replica and oasis-replica-itb : remove_osg_repository -f Tell the FNAL Stratum-1 operators to go ahead and remove the repository. Response to Security Incident on an External Repository If there is a security incident on the publishing machine of an external repository and a publishing key is compromised, the fingerprint of that key should be added to /cvmfs/config-osg.opensciencegrid.org/etc/cvmfs/blacklist . In addition, another line should be added in the form . When the BNL Stratum-1 administrator has reported back that the replication is ready, respond to the requester that the repository is fully replicated on the OSG and close the ticket.","title":"Requests to Host a Repository on OASIS"},{"location":"external-oasis-repos/#requests-to-change-the-url-of-an-external-repository","text":"If there is a request to change the URL of an external repository, update the registered value in OASISRepoURLs for the respective VO in the topology. 
Tell the requester that it is ready 15 minutes after topology is updated.","title":"Requests to Change the URL of an External Repository"},{"location":"external-oasis-repos/#requests-to-remove-an-external-repository","text":"After validating that the ticket submitter is authorized by the VO's OASIS manager, delete the registered value for in topology for the VO in OASIS Repo URLs. Verify that it is removed by running the following on any oasis machine to make sure it is missing from the list: print_osg_repos|grep Check if the repository has been replicated to RAL by looking in their repositories.json . The user documentation requests the user to make a GGUS ticket to do this, so either ask them to do it or do it yourself. Add the BNL Stratum-1 operator (John De Stefano) to the ticket and ask him to remove the repository. Wait for him to finish before proceeding. Add the FNAL Stratum-1 operators (Merina Albert, Hyun Woo Kim) to the ticket and ask them when they can be ready to delete the repository. They can't remove it before it is removed from oasis-replica because their Stratum-1 automatically adds all repositories oasis-replica has. However, it has to be done within 8 hours of removal on oasis-replica or an alarm will start going off. Run the following command on oasis , oasis-itb , oasis-replica and oasis-replica-itb : remove_osg_repository -f Tell the FNAL Stratum-1 operators to go ahead and remove the repository.","title":"Requests to Remove an External Repository"},{"location":"external-oasis-repos/#response-to-security-incident-on-an-external-repository","text":"If there is a security incident on the publishing machine of an external repository and a publishing key is compromised, the fingerprint of that key should be added to /cvmfs/config-osg.opensciencegrid.org/etc/cvmfs/blacklist . In addition, another line should be added in the form /server.conf , add the lines: CVMFS_COMPRESSION_ALGORITHM=none CVMFS_GARBAGE_COLLECTION=true CVMFS_AUTO_GC=true CVMFS_AUTO_GC_TIMESPAN=\"2 days ago\" CVMFS_EXTERNAL_DATA=true CVMFS_AUTO_TAG_TIMESPAN=\"2 weeks ago\" Check in the file for duplicate lines of the above with different settings, comment ( # ) out those lines. The imporatant line is CVMFS_COMPRESSION_ALGORITHM . If it is set to the default, then CVMFS clients will expect the data to be delivered in compressed format, while the caches will deliver the file in un-compressed format. Configure CVMFS-sync cvmfs-sync synchonizes the data from an XRootD server (origin) to a CVMFS repo. Create a new config file (by copying another existing config) in /etc/cvmfs-sync . The name of the configuration should be .config In the config, you will need to modify the repo, source and destination. This is where cvmfs-sync will scan for new files to add to the CVMFS repo. Make the systemd timer Copy an existing timer like: cp - r / etc / systemd / system / cvmfs - data - update @gwosc . osgstorage . org . service . d / etc / systemd / system / cvmfs - data - update @ < reponame > . service . d You may need to edit the override file in the directory above to change the user. 
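A sketch of the timer-directory copy described above, assuming a hypothetical repository name example.osgstorage.org:

```shell
# "example.osgstorage.org" is a placeholder; use the real repository name.
cp -r /etc/systemd/system/cvmfs-data-update@gwosc.osgstorage.org.service.d \
      /etc/systemd/system/cvmfs-data-update@example.osgstorage.org.service.d
# then edit the override file for the user if needed, and enable the timer as described next
```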
Enable the timer: systemctl enable cvmfs-data-update@.timer systemctl start cvmfs-data-update@.timer Checking cvmfs-sync journalctl -u cvmfs-data-update@","title":"Adding External CVMFS repos"},{"location":"services/adding-external-cvmfs-repos/#adding-external-cvmfs-repos","text":"This document describes how to add an external repo like public-uc.osgstorage.org or ligo.storage.igwn.org.","title":"Adding External CVMFS Repos"},{"location":"services/adding-external-cvmfs-repos/#use-osg-docs","text":"Authoritative source for adding new oasis repo is Install an OASIS Repository . Follow instructions starting at heading \"Creating a Repository\", your first command should be cvmfs_server mkfs When it asks you to open a ticket, if it's an osgstorage.org or opensciencegrid.org domain, then all you need to do is add the CVMFS repo to topology like: https://github.com/opensciencegrid/topology/pull/3986 Once you have completed adding the fetch-cvmfs-whitelist line to cron, you are done with the OSG documentation.","title":"Use OSG Docs"},{"location":"services/adding-external-cvmfs-repos/#configure-the-external-cvmfs-repo","text":"In /etc/cvmfs/repositories.d//server.conf , add the lines: CVMFS_COMPRESSION_ALGORITHM=none CVMFS_GARBAGE_COLLECTION=true CVMFS_AUTO_GC=true CVMFS_AUTO_GC_TIMESPAN=\"2 days ago\" CVMFS_EXTERNAL_DATA=true CVMFS_AUTO_TAG_TIMESPAN=\"2 weeks ago\" Check in the file for duplicate lines of the above with different settings, comment ( # ) out those lines. The imporatant line is CVMFS_COMPRESSION_ALGORITHM . If it is set to the default, then CVMFS clients will expect the data to be delivered in compressed format, while the caches will deliver the file in un-compressed format.","title":"Configure the external CVMFS repo"},{"location":"services/adding-external-cvmfs-repos/#configure-cvmfs-sync","text":"cvmfs-sync synchonizes the data from an XRootD server (origin) to a CVMFS repo. Create a new config file (by copying another existing config) in /etc/cvmfs-sync . The name of the configuration should be .config In the config, you will need to modify the repo, source and destination. This is where cvmfs-sync will scan for new files to add to the CVMFS repo.","title":"Configure CVMFS-sync"},{"location":"services/adding-external-cvmfs-repos/#make-the-systemd-timer","text":"Copy an existing timer like: cp - r / etc / systemd / system / cvmfs - data - update @gwosc . osgstorage . org . service . d / etc / systemd / system / cvmfs - data - update @ < reponame > . service . d You may need to edit the override file in the directory above to change the user. 
Enable the timer: systemctl enable cvmfs-data-update@.timer systemctl start cvmfs-data-update@.timer","title":"Make the systemd timer"},{"location":"services/adding-external-cvmfs-repos/#checking-cvmfs-sync","text":"journalctl -u cvmfs-data-update@","title":"Checking cvmfs-sync"},{"location":"services/ce-monitoring-dashboards/","text":"CE Monitoring Dashboards Links to CE Monitoring Dashboards: Ganglia - Miron GlideIn View OSG CPU/GPU Hours Table GRACC: GPU Utilization by Project / Site GRACC: OSG GPU Payload Jobs Summary","title":"CE Monitoring Dashboards"},{"location":"services/ce-monitoring-dashboards/#ce-monitoring-dashboards","text":"Links to CE Monitoring Dashboards: Ganglia - Miron GlideIn View OSG CPU/GPU Hours Table GRACC: GPU Utilization by Project / Site GRACC: OSG GPU Payload Jobs Summary","title":"CE Monitoring Dashboards"},{"location":"services/finalize-cache-registration/","text":"Finalizing New Cache Registration Once a new cache is registered with OSG, there are additional operations tasks that must be performed before it is usable by clients. The steps on this page are for OSG Operations; sysadmins should follow the cache registration document and open a support ticket to have these steps executed. Un-Authenticated Cache Test to make sure the cache is working by executing the following: console $ curl http://hcc-stash.unl.edu:8000/user/rynge/public/test.txt Hello! Open a pull request to add the cache to https://github.com/opensciencegrid/StashCache/blob/master/bin/caches.json (obsolete) file within the StashCache repo. Open a pull request adding the cache to CVMFS_EXTERNAL_URL in the https://github.com/opensciencegrid/oasis-server/blob/master/goc/config-osg/etc/cvmfs/domain.d/osgstorage.org.conf (obsolete) file. Authenticated Cache For an authenticated cache, it will need to be added to the specific CVMFS configuration for the authenticated domain. For example, if it is a LIGO authenticated cache, it will need to be added to the CVMFS_EXTERNAL_URL within the ligo.osgstorage.org.conf file in the https://github.com/opensciencegrid/oasis-server/tree/master/goc/config-osg/etc/cvmfs/config.d (obsolete) directory. A CMS authenticated cache will need to be added to the cms.osgstorage.org.conf file Open a pull request adding the authenticated cache to CVMFS_EXTERNAL_URL in the appropriate domain configuration file within https://github.com/opensciencegrid/oasis-server/tree/master/goc/config-osg/etc/cvmfs/config.d (obsolete). Coordinate with the VO to test that authorization works. As each VO is expected to export a different directory and require different authorizations, a custom test must be arranged each time.","title":"Finalize Cache Registration"},{"location":"services/finalize-cache-registration/#finalizing-new-cache-registration","text":"Once a new cache is registered with OSG, there are additional operations tasks that must be performed before it is usable by clients. The steps on this page are for OSG Operations; sysadmins should follow the cache registration document and open a support ticket to have these steps executed.","title":"Finalizing New Cache Registration"},{"location":"services/finalize-cache-registration/#un-authenticated-cache","text":"Test to make sure the cache is working by executing the following: console $ curl http://hcc-stash.unl.edu:8000/user/rynge/public/test.txt Hello! Open a pull request to add the cache to https://github.com/opensciencegrid/StashCache/blob/master/bin/caches.json (obsolete) file within the StashCache repo. 
Open a pull request adding the cache to CVMFS_EXTERNAL_URL in the https://github.com/opensciencegrid/oasis-server/blob/master/goc/config-osg/etc/cvmfs/domain.d/osgstorage.org.conf (obsolete) file.","title":"Un-Authenticated Cache"},{"location":"services/finalize-cache-registration/#authenticated-cache","text":"For an authenticated cache, it will need to be added to the specific CVMFS configuration for the authenticated domain. For example, if it is a LIGO authenticated cache, it will need to be added to the CVMFS_EXTERNAL_URL within the ligo.osgstorage.org.conf file in the https://github.com/opensciencegrid/oasis-server/tree/master/goc/config-osg/etc/cvmfs/config.d (obsolete) directory. A CMS authenticated cache will need to be added to the cms.osgstorage.org.conf file Open a pull request adding the authenticated cache to CVMFS_EXTERNAL_URL in the appropriate domain configuration file within https://github.com/opensciencegrid/oasis-server/tree/master/goc/config-osg/etc/cvmfs/config.d (obsolete). Coordinate with the VO to test that authorization works. As each VO is expected to export a different directory and require different authorizations, a custom test must be arranged each time.","title":"Authenticated Cache"},{"location":"services/gracc-corrections/","text":"Installing GRACC Corrections GRACC Corrections are used to modify records during the summarization process. RAW records are not modified in the correction process. The correction is applied after summarization and aggregation, but before the record is enriched with data from Topology . The correction is step 3 in the GRACC summary record workflow: Raw record is received. The raw record is never modified Summarizer aggregates the raw records Corrections are applied Summarized records are enriched by Topology Summarized and enriched records are uploaded to GRACC We can currently correct: VO Names Project Names OIM_Site (using the Host_description field) Limitations Additional corrections can be written, but some attributes are used to detect duplicate records, and are therefore protected from corrections. Protected records for summarization are: EndTime, RawVOName, RawProjectName, DN, Processors, ResourceType, CommonName, Host_description, Resource_ExitCode, Grid, ReportableVOName, ProbeName For example, we could not write a correction for the Host_description . If we had a correction that changed Host_description , then the duplicate detection would not detect the same record during resummarization and it would have duplicate summarized records. Command Line The gracc-correct tool is used to create, update, and delete corrections. The tool must be run from a host that can write to GRACC, which is very restricted. It is recommended to run the gracc-correct tool directly from the gracc.opensciencegrid.org host. The gracc-correct tool is able to parse new corrections either individually from user input or many at once from a CSV file. User Input Each correction attempts to match one or more attributes of the summarized record in order to set another attribute. For example, for the VO correction: $ gracc-correct vo add Field ( s ) to correct: VOName: ReportableVOName: Corrected VOName: CSV File A CSV file can be specified in order to specify multiple corrections in a single batch update. The CSV file must be of a certain format. No Header Row The number of columns must be at least the number of matching attributes and the corrected attribute. For example, a CSV file for VO corrections would be of format: ,,,.... 
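A sketch of one row of such a VO-correction CSV (the values are purely illustrative; the columns are the match fields VOName and ReportableVOName followed by the corrected VOName, with no header row):

```csv
examplevo,examplevo,Example-VO
```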
The CSV file can be specified on the command line with the option --csv , for example: ./gracc-correct vo add --csv ","title":"GRACC Corrections"},{"location":"services/gracc-corrections/#installing-gracc-corrections","text":"GRACC Corrections are used to modify records during the summarization process. RAW records are not modified in the correction process. The correction is applied after summarization and aggregation, but before the record is enriched with data from Topology . The correction is step 3 in the GRACC summary record workflow: Raw record is received. The raw record is never modified Summarizer aggregates the raw records Corrections are applied Summarized records are enriched by Topology Summarized and enriched records are uploaded to GRACC We can currently correct: VO Names Project Names OIM_Site (using the Host_description field)","title":"Installing GRACC Corrections"},{"location":"services/gracc-corrections/#limitations","text":"Additional corrections can be written, but some attributes are used to detect duplicate records, and are therefore protected from corrections. Protected records for summarization are: EndTime, RawVOName, RawProjectName, DN, Processors, ResourceType, CommonName, Host_description, Resource_ExitCode, Grid, ReportableVOName, ProbeName For example, we could not write a correction for the Host_description . If we had a correction that changed Host_description , then the duplicate detection would not detect the same record during resummarization and it would have duplicate summarized records.","title":"Limitations"},{"location":"services/gracc-corrections/#command-line","text":"The gracc-correct tool is used to create, update, and delete corrections. The tool must be run from a host that can write to GRACC, which is very restricted. It is recommended to run the gracc-correct tool directly from the gracc.opensciencegrid.org host. The gracc-correct tool is able to parse new corrections either individually from user input or many at once from a CSV file.","title":"Command Line"},{"location":"services/gracc-corrections/#user-input","text":"Each correction attempts to match one or more attributes of the summarized record in order to set another attribute. For example, for the VO correction: $ gracc-correct vo add Field ( s ) to correct: VOName: ReportableVOName: Corrected VOName: ","title":"User Input"},{"location":"services/gracc-corrections/#csv-file","text":"A CSV file can be specified in order to specify multiple corrections in a single batch update. The CSV file must be of a certain format. No Header Row The number of columns must be at least the number of matching attributes and the corrected attribute. For example, a CSV file for VO corrections would be of format: ,,,.... The CSV file can be specified on the command line with the option --csv , for example: ./gracc-correct vo add --csv ","title":"CSV File"},{"location":"services/hosted-ce-definitions/","text":"OSG Hosted CE Definitions The OSG provides a Hosted CE service. In general, this document lists what an instance of that service can and cannot do. Hosted CEs in General Benefits The site continues to operate its own batch system according to local considerations; OSG operates the interface between OSG and the site, aka the Hosted CE; To the site, OSG simply looks like a set of user accounts; and OSG uses the accounts to provision site resources for various science user communities, and hence the site has complete control over resource allocation via local policies on the accounts. 
Prerequisites In general, the site must operate a working batch system that is accessible via at least one head node; OSG works with HTCondor, Slurm, PBS Pro/Torque, LSF, and Grid Engine. Site operations include hardware and software maintenance, defining and implementing usage policies, monitoring, troubleshooting, etc. These are the same activities to support local users. In addition, the site: Must communicate with OSG their intent to share resources \u2014 in most cases, a meeting between site and OSG staff should be sufficient to discuss goals, plans, etc.; Must meet the technical requirements on the OSG website , summarized below: The site is willing to add OSG user accounts with inbound SSH access and submit privileges, A mechanism exists for transferring files between the head nodes and worker nodes, and Worker nodes must have outbound Internet access and temporary storage space for jobs. Is strongly encouraged to tell OSG about preferred constraints on resource requests (e.g., per-job limits on CPUs, memory, and storage; overall limits on number of running and idle jobs; submission rates), so that OSG can tailor such requests to better fit the site. Standard Hosted CE A Standard Hosted CE is the default case in which the interaction between OSG and the site is relatively simple and easy to maintain. Most sites fall into this category. Benefits Configuration is limited to basics, so there is less upfront and ongoing work for OSG and the site; OSG maintains and shares mappings from user groups to OSG user accounts on the site, so that the site can \u2014 if desired \u2014 limit resource allocations to certain groups; and OSG maintains the required OSG configuration on the site\u2019s head node and worker nodes (if the site provides a distribution mechanism to worker nodes, such as a shared file system). Site Responsibilities In addition to the general prerequisites above, the following apply to a Standard Hosted CE: The site must create and maintain 20 OSG user accounts on a single head node; note that: OSG will access their accounts via SSH using one RSA key for all 20 accounts; and All 20 OSG accounts must be able to submit to the local batch system. The site may control the resources allocated to different OSG user groups by writing and maintaining policies on the OSG user accounts within the batch system. The site provides privilege separation among the OSG user groups via the OSG user accounts and standard Unix privilege separation.","title":"Hosted CE Definitions"},{"location":"services/hosted-ce-definitions/#osg-hosted-ce-definitions","text":"The OSG provides a Hosted CE service. 
In general, this document lists what an instance of that service can and cannot do.","title":"OSG Hosted CE Definitions"},{"location":"services/hosted-ce-definitions/#hosted-ces-in-general","text":"","title":"Hosted CEs in General"},{"location":"services/hosted-ce-definitions/#benefits","text":"The site continues to operate its own batch system according to local considerations; OSG operates the interface between OSG and the site, aka the Hosted CE; To the site, OSG simply looks like a set of user accounts; and OSG uses the accounts to provision site resources for various science user communities, and hence the site has complete control over resource allocation via local policies on the accounts.","title":"Benefits"},{"location":"services/hosted-ce-definitions/#prerequisites","text":"In general, the site must operate a working batch system that is accessible via at least one head node; OSG works with HTCondor, Slurm, PBS Pro/Torque, LSF, and Grid Engine. Site operations include hardware and software maintenance, defining and implementing usage policies, monitoring, troubleshooting, etc. These are the same activities to support local users. In addition, the site: Must communicate with OSG their intent to share resources \u2014 in most cases, a meeting between site and OSG staff should be sufficient to discuss goals, plans, etc.; Must meet the technical requirements on the OSG website , summarized below: The site is willing to add OSG user accounts with inbound SSH access and submit privileges, A mechanism exists for transferring files between the head nodes and worker nodes, and Worker nodes must have outbound Internet access and temporary storage space for jobs. Is strongly encouraged to tell OSG about preferred constraints on resource requests (e.g., per-job limits on CPUs, memory, and storage; overall limits on number of running and idle jobs; submission rates), so that OSG can tailor such requests to better fit the site.","title":"Prerequisites"},{"location":"services/hosted-ce-definitions/#standard-hosted-ce","text":"A Standard Hosted CE is the default case in which the interaction between OSG and the site is relatively simple and easy to maintain. Most sites fall into this category.","title":"Standard Hosted CE"},{"location":"services/hosted-ce-definitions/#benefits_1","text":"Configuration is limited to basics, so there is less upfront and ongoing work for OSG and the site; OSG maintains and shares mappings from user groups to OSG user accounts on the site, so that the site can \u2014 if desired \u2014 limit resource allocations to certain groups; and OSG maintains the required OSG configuration on the site\u2019s head node and worker nodes (if the site provides a distribution mechanism to worker nodes, such as a shared file system).","title":"Benefits"},{"location":"services/hosted-ce-definitions/#site-responsibilities","text":"In addition to the general prerequisites above, the following apply to a Standard Hosted CE: The site must create and maintain 20 OSG user accounts on a single head node; note that: OSG will access their accounts via SSH using one RSA key for all 20 accounts; and All 20 OSG accounts must be able to submit to the local batch system. The site may control the resources allocated to different OSG user groups by writing and maintaining policies on the OSG user accounts within the batch system. 
The site provides privilege separation among the OSG user groups via the OSG user accounts and standard Unix privilege separation.","title":"Site Responsibilities"},{"location":"services/install-gwms-factory/","text":"GlideinWMS Factory Installation This document describes how to install a Glidein Workflow Managment System (GlideinWMS) Factory instance. This document assumes expertise with HTCondor and familiarity with the GlideinWMS software. It does not cover anything but the simplest possible install. Please consult the GlideinWMS reference documentation for advanced topics, including non-root, non-RPM-based installation. In this document the terms glidein and pilot (job) will be used interchangeably. This parts covers these primary components of the GlideinWMS system: WMS Collector / Schedd : A set of condor_collector and condor_schedd processes that allow the submission of pilots to Grid entries. GlideinWMS Factory : The process submitting the pilots when needed Warning We really recommend you to use the OSG provided Factory and not to install your own . A VO Frontend is sufficient to submit your jobs and to decide scheduling policies. And this will avoid for you the complexity to deal directly with grid/cloud sites. If you really need you own Factory be aware that it is a complex component and may require a non trivial maintenance effort. Before Starting Before starting the installation process, consider the following points (consulting the Reference section below as needed): Requirements Host and OS A host to install the GlideinWMS Factory (pristine node). Currently most of our testing has been done on Scientific Linux 6 and 7. Root access The GlideinWMS Factory has the following requirements: CPU : 4-8 cores for a large installation (1 should suffice on a small install) RAM : 4-8GB on a large installation (1GB should suffice for small installs) Disk : 10GB will be plenty sufficient for all the binaries, config and log files related to GlideinWMS. If you are a large site with need to keep significant history and logs, you may want to allocate 100GB+ to store long histories. Users The GlideinWMS Factory installation will create the following users unless they are already created . User Default uid Comment condor none HTCondor user (installed via dependencies). gfactory none This user runs the GlideinWMS VO factory. To verify that the user gfactory has gfactory as primary group check the output of root@host # getent passwd gfactory | cut -d: -f4 | xargs getent group It should be the gfactory group. Certificates Certificate User that owns certificate Path to certificate Host certificate root /etc/grid-security/hostcert.pem /etc/grid-security/hostkey.pem Here are instructions to request a host certificate. The host certificate/key is used for authorization, however, authorization between the Factory and the GlideinWMS collector is done by file system authentication. Networking Firewalls It must be on the public internet, with at least one port open to the world; all worker nodes will load data from this node trough HTTP. Note that worker nodes will also need outbound access in order to access this HTTP port. Installation Procedure As with all OSG software installations, there are some one-time (per host) steps to prepare in advance: Ensure the host has a supported operating system Obtain root access to the host Prepare the required Yum repositories Install CA certificates Installing HTCondor Most required software is installed from the Factory RPM installation. 
HTCondor is the only exception since there are many different ways to install it , using the RPM system or not. You need to have HTCondor installed before installing the GlideinWMS Factory. If yum cannot find a HTCondor RPM, it will install the dummy empty-condor RPM, assuming that you installed HTCondor using a tarball distribution. If you don't have HTCondor already installed, you can install the HTCondor RPM from the OSG repository: root@host # yum install condor.x86_64 Installing HTCondor-BOSCO If you plan to send jobs using direct batch submission (aka BOSCO), then you need also the condor-bosco package. You'll have to install the package and remove one of its files /etc/condor/config.d/60-campus_factory.config because it interferes with the Factory configuration. root@host # yum install condor-bosco root@host # rm /etc/condor/config.d/60-campus_factory.config root@host # touch /etc/condor/config.d/60-campus_factory.config Install GWMS Factory Download and install the Factory RPM Install the RPM and dependencies (be prepared for a lot of dependencies). root@host # yum install glideinwms-factory This will install the current production release verified and tested by OSG with default HTCondor configuration. This command will install the GlideinWMS Factory, HTCondor, the OSG client, and all the required dependencies. If you wish to install a different version of GlideinWMS, add the \"--enablerepo\" argument to the command as follows: yum install --enablerepo=osg-testing glideinwms-factory : The most recent production release, still in testing phase. This will usually match the current tarball version on the GlideinWMS home page . (The osg-release production version may lag behind the tarball release by a few weeks as it is verified and packaged by OSG). Note that this will also take the osg-testing versions of all dependencies as well. yum install --enablerepo=osg-upcoming glideinwms-factory : The most recent development series release, ie version 3.3.x release. This has newer features such as cloud submission support, but is less tested. Download HTCondor tarballs You will need to download HTCondor tarballs for each architecture that you want to deploy pilots on . At this point, GlideinWMS factory does not support pulling HTCondor binaries from your system area. Suggested is that you put these binaries in /var/lib/gwms-factory/condor but any gfactory accessible location should suffice. Configuration Procedure After installing the RPM you need to configure the components of the GlideinWMS Factory: Edit Factory configuration options Edit HTCondor configuration options Create a HTCondor grid map file Reconfigure and Start Factory Configuring the Factory The configuration file is /etc/gwms-factory/glideinWMS.xml . The next steps will describe each line that you will need to edit for most cases, but you may want to review the whole file to be sure that it is configured correctly. Security configuration In the security section, you will need to provide each Frontend that is allowed to communicate with the Factory: security key_length=\"2048\" pub_key=\"RSA\" remove_old_cred_age=\"30\" remove_old_cred_freq=\"24\" reuse_oldkey_onstartup_gracetime=\"900\"> These attributes are very important to get exactly right or the Frontend will not be trusted. 
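A rough sketch of how that security block and one trusted Frontend are commonly laid out in /etc/gwms-factory/glideinWMS.xml; the frontend name, identity host, and mapped username below are placeholders, and the authoritative layout is the one shipped in the RPM's template:

```xml
<security key_length="2048" pub_key="RSA" remove_old_cred_age="30"
          remove_old_cred_freq="24" reuse_oldkey_onstartup_gracetime="900">
  <frontends>
    <frontend name="vo_frontend" identity="vofrontend_service@frontend.example.edu">
      <security_classes>
        <security_class name="frontend" username="frontend"/>
      </security_classes>
    </frontend>
  </frontends>
</security>
```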
This should match one of the factory and security sections of the Frontend configuration Configuring the GlideinWMS Frontend in the following way: Note This is a snippet from the Frontend configuration (for reference), not the Factory that you are configuring now! For the factory section: # from frontend.xml .... For the security: # from frontend.xml Note that the identity of the Frontend must match what HTCondor authenticates the DN of the frontend to. In /etc/condor/certs/condor_mapfile , there must be an entry with vofrontend_service definition (in this case): GSI \"^\\/DC\\=org\\/DC\\=doegrids\\/OU\\=Services\\/CN\\=Some\\ Name\\ 834323%ENDCOLOR%$\" % GREEN % vofrontend_service % ENDCOLOR % Entry configuration Entries are grid/cloud endpoints (aka Compute Elements, or gatekeepers) that can accept job requests and run pilots (which will run user jobs). Each entry needs to be configured to communicate to a specific gatekeeper. An example test entry is provided in the default GlideinWMS configuration file. At the very least, you will need to modify the entry line: You will need to modify the entry name and gatekeeper . This will determine the gatekeeper that you access. Specific gatekeepers often require specific \"rsl\" attributes that determine the job queue that you are submitting to, or other attributes. Add them in the rsl attribute. Also, be sure to distribute your entries across the various HTCondor schedd work managers to balance load. To see the available schedd use condor_status -schedd -l | grep Name . Several schedd options are configured by default for you: schedd_glideins2, schedd_glideins3, schedd_glideins4, schedd_glideins5 , as well as the default schedd . This can be modified in the HTCondor configuration. Add any specific options, such as limitations on jobs/pilots or glexec/voms requirements in the entry section below the above line. More details are in the GlideinWMS Factory configuration guide . !!! warning If there is no match between auth_metod and trust_domain of the entry and the type and trust_domain listed in one of the credentials of one of the Frontends using this Factory, then no job can run on that entry. The Factory must advertise the correct Resource Name of each entry for accounting purposes. Then the Factory must also advertise in the entry all the attributes that will allow to match the query expression used in the Frontends connecting to this Factory (e.g. as explained in the VO frontend configuration document ). Note Keep an eye on this part as we're dealing with singularity. Then you must advertise correctly if the site supports gLExec . If it does not set GLEXEC_BIN to NONE , if gLExec is installed via OSG set it to OSG , otherwise set it to the path of gLExec. For example this snippet advertises GLIDEIN_Supported_VOs attribute with the supported VO so that can be used with the query above in the VO frontend and says that the resource does not support gLExec: ... ... Note Specially if jobs are sent to OSG resources, it is very important to set the GLIDEIN_Resource_Name and to be consistent with the Resource Name reported in OIM because that name will be used for job accounting in Gratia. It should be the name of the Resource in OIM or the name of the Resource Group (specially if there are many gatekeepers submitting to the same cluster). More information on options can be found here Configuring Tarballs Each pilot will download HTCondor binaries from the staging area. 
Often, multiple binaries are needed to support various architectures and platforms. Currently, you will need to provide at least one tarball for GlideinWMS to use. (Using the system binaries is currently not supported). Download a HTCondor tarball from here . Suggested is to put the binaries in /var/lib/gwms-factory/condor , but any factory-accessible location will do just fine. Once you have downloaded the tarball, configure it in /etc/gwms-factory/glideinWMS.xml like in the following: Remember also to modify the condor_os and condor_arch attributes in the entries (the configured Compute Elements) to pick the correct HTCondor binary. Here are more details on using multiple HTCondor binaries. Note that is sufficient to set the base_dir ; the reconfigure command will prepare the tarball and add it to the XML config file. Configuring HTCondor The HTCondor configuration for the Factory is placed in /etc/condor/config.d . 00_gwms_factory_general.config 00-restart_peaceful.config 01_gwms_factory_collectors.config 02_gwms_factory_schedds.config 03_gwms_local.config 10-batch_gahp_blahp.config Get rid of the pre-loaded HTCondor default root@host # rm /etc/condor/config.d/00personal_condor.config root@host # touch /etc/condor/config.d/00personal_condor.config For most installations, the items you need to modify are in 03_gwms_factory_local.config . The lines you will have to edit are: Credentials of the machine. You can either run using a proxy, or a service certificate. It is recommended to use a host certificate and specify its location in the variables GSI_DAEMON_CERT and GSI_DAEMON_KEY . The host certificate should be owned by root and have the correct permissions, 600. HTCondor ids in the form UID.GID (both are integers) HTCondor admin email. Will receive messages when services fail. #-- HTCondor user: condor CONDOR_IDS = #-- Contact (via email) when problems occur CONDOR_ADMIN = ############################ # GSI Security config ############################ #-- Grid Certificate directory GSI_DAEMON_TRUSTED_CA_DIR= /etc/grid-security/certificates #-- Credentials GSI_DAEMON_CERT = /etc/grid-security/hostcert.pem GSI_DAEMON_KEY = /etc/grid-security/hostkey.pem #-- HTCondor mapfile CERTIFICATE_MAPFILE= /etc/condor/certs/condor_mapfile ################################### # Whitelist of HTCondor daemon DNs ################################### #DAEMON_LIST = COLLECTOR, MASTER, NEGOTIATOR, SCHEDD, STARTD Using other HTCondor RPMs, e.g. UW Madison HTCondor RPM The above procedure will work if you are using the OSG HTCondor RPMS. You can verify that you used the OSG HTCondor RPM by using yum list condor . The version name should include \"osg\", e.g. 8.6.9-1.1.osg34.el7 . If you are using the UW Madison HTCondor RPMS, be aware of the following changes: This HTCondor RPM uses a file /etc/condor/condor_config.local to add your local machine slot to the user pool. If you want to disable this behavior (recommended), you should blank out that file or comment out the line in /etc/condor/condor_config for LOCAL_CONFIG_FILE. (Make sure that LOCAL_CONFIG_DIR is set to /etc/condor/config.d ) Note that the variable LOCAL_DIR is set differently in UW Madison and OSG RPMs. This should not cause any more problems in the Glideinwms RPMs, but please take note if you use this variable in your job submissions or other customizations. 
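A quick check of the points above when a non-OSG HTCondor RPM is in use (these are standard condor_config_val queries; the blanking step only applies if the personal-condor file exists):

```shell
# Where is HTCondor reading local configuration from?
condor_config_val LOCAL_CONFIG_FILE LOCAL_CONFIG_DIR LOCAL_DIR
# If the UW Madison personal-condor file is present and unwanted, blank it:
> /etc/condor/condor_config.local
```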
In general if you are using a non OSG RPM or if you added custom configuration files for HTCondor please check the order of the configuration files: root@host # condor_config_val -config Configuration source: /etc/condor/condor_config Local configuration sources: /etc/condor/config.d/00-restart_peaceful.config /etc/condor/config.d/00_gwms_factory_general.config /etc/condor/config.d/01_gwms_factory_collectors.config /etc/condor/config.d/02_gwms_factory_schedds.config /etc/condor/config.d/03_gwms_local.config /etc/condor/config.d/10-batch_gahp_blahp.config /etc/condor/condor_config.local Restarting HTCondor After configuring HTCondor, be sure to restart HTCondor: root@host # service condor restart Create a HTCondor grid mapfile. The HTCondor grid mapfile /etc/condor/certs/condor_mapfile is used for authentication between the glidein running on a remote worker node, and the local collector. HTCondor uses the mapfile to map certificates to pseudo-users on the local machine. It is important that you map the DN's of each frontend you are talking to. Below is an example mapfile, by default found in /etc/condor/certs/condor_mapfile : GSI \"^\\/DC\\=org\\/DC\\=doegrids\\/OU\\=People\\/CN\\=Some\\ Name\\ 123456$\" frontend GSI (.*) anonymous FS (.*) \\1 Each frontend needs a line that maps to the user specified in the identity argument in the frontend security section of the Factory configuration. Reconfiguring GlideinWMS After changing the configuration of GlideinWMS and making sure that Factory is running, use the following table to find the appropriate command for your operating system (run as root ): If your operating system is... Run the following command... Enterprise Linux 7 systemctl reload gwms-factory Enterprise Linux 6 service gwms-factory reconfig Note Notice that, in the case of Enterprise Linux 7 systemctl reload gwms-factory will work only if: - gwms-factory service is running - gwms-factory service was started with systemctl Otherwise, you will get the following error in any of the cases: # systemctl reload gwms-factory Job for gwms-factory.service invalid. Upgrading GlideinWMS Before you start the Factory service for the first time or after an update of the RPM or after you change GlideinWMS scripts, you should always use the GlideinWMS \"upgrade\" command. To do so: Make sure the condor and gwms-factory services are stopped (in EL6 this will be done for you). Issue the upgrade command: If you are using Enterprise Linux 7: root@host # /usr/sbin/gwms-factory upgrade If you are using Enterprise Linux 6: root@host # service gwms-factory upgrade Start the condor and gwms-factory services (see next part). Service Activation and Deactivation To start the Factory you must start also HTCondor and the Web server beside the Factory itself: # %RED%For RHEL 6 , CentOS 6 , and SL6%ENDCOLOR% root@host # service condor start root@host # service httpd start root@host # service gwms-factory start # %RED% For RHEL 7 , CentOS 7 , and SL7%ENDCOLOR% root@host # systemctl start condor root@host # systemctl start httpd root@host # systemctl start gwms-factory Note Once you successfully start using the Factory service, anytime you change the /etc/gwms-factory/glideinWMS.xml file you will need to run a reconfig/reload command. 
If you change also some code you need the upgrade command mentioned above: # %RED% For RHEL 6 , CentOS 6 , and SL6%ENDCOLOR% root@host # service gwms-factory reconfig # %RED% But the situation is a bit more complicated in RHEL 7 , CentOS 7 , and SL7 due to systemd restrictions%ENDCOLOR% # %GREEN% For reconfig:%ENDCOLOR% A. %RED% when the Factory is running%ENDCOLOR% A.1 %RED% without any additional options%ENDCOLOR% root@host # /usr/sbin/gwms-factory reconfig%ENDCOLOR% or root@host # systemctl reload gwms-factory A.2 %RED% if you want to give additional options %ENDCOLOR% systemctl stop gwms-factory /usr/sbin/gwms-factory reconfig \"and your options\" systemctl start gwms-factory B. %RED% when the Factory is NOT running %ENDCOLOR% root@host # /usr/sbin/gwms-factory reconfig ( \"and your options\" ) To enable the services so that they restart after a reboot: # %RED%# For RHEL 6 , CentOS 6 , and SL6%ENDCOLOR% root@host # /sbin/chkconfig fetch-crl-cron on root@host # /sbin/chkconfig fetch-crl-boot on root@host # /sbin/chkconfig condor on root@host # /sbin/chkconfig httpd on root@host # /sbin/chkconfig gwms-factory on # %RED%# For RHEL 7 , CentOS 7 , and SL7%ENDCOLOR% root@host # systemctl enable fetch-crl-cron root@host # systemctl enable fetch-crl-boot root@host # systemctl enable condor root@host # systemctl enable httpd root@host # systemctl enable gwms-factory To stop the Factory: # %RED%For RHEL 6 , CentOS 6 , and SL6 %ENDCOLOR% root@host # service gwms-factory stop # %RED%For RHEL 7 , CentOS 7 , and SL7%ENDCOLOR% root@host # systemctl stop gwms-factory And you can stop also the other services if you are not using them independently of the Factory. Validating GlideinWMS Factory The complete validation of the Factory is the submission of actual jobs. You can also check that the services are up and running: root@host # condor_status -any MyType TargetType Name glidefactoryclient None 12345_TEST_ENTRY@gfactory_instance@ glideclient None 12345_TEST_ENTRY@gfactory_instance@ glidefactory None TEST_ENTRY@gfactory_instance@ glidefactoryglobal None gfactory_instance@gfactory_ser glideclientglobal None gfactory_instance@gfactory_ser Scheduler None hostname.fnal.gov DaemonMaster None hostname.fnal.gov Negotiator None hostname.fnal.gov Scheduler None schedd_glideins2@hostname Scheduler None schedd_glideins3@hostname Scheduler None schedd_glideins4@hostname Scheduler None schedd_glideins5@hostname Collector None wmscollector_service@hostname You should have one \"glidefactory\" classAd for each entry that you have enabled. If you have already configured the frontends, you will also have one glidefactoryclient and one glideclient classAd for each frontend / entry. You can check also the monitoring Web page: http://YOUR_HOST_FQDN/factory/monitor/ You can also test the local submission of a job to a resource using the test script local_start.sh but you must first install the OSG client tools and generate a proxy. After that you can run the test (replace ENTRY_NAME with the name of one of the entries in /etc/gwms-factory/glideinWMS.xml ): Check Web server configuration for the monitoring Verify path and specially the URL for the GlideinWMS files served by your web server: stage base_dir = \"/var/lib/gwms-factory/web-area/stage\" use_symlink = \"True\" web_base_url = \"http://HOSTNAME:PORT/factory/stage\" This will determine the location of your web server . Make sure that the URL is visible. 
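As a quick reachability check you can request the staging URL from another host; a minimal sketch, assuming curl is available and substituting the web_base_url value from your configuration for HOSTNAME:PORT : root@host # curl -sI http://HOSTNAME:PORT/factory/stage/ | head -n 1 Any HTTP response (even a 403) shows that the web server is reachable; a connection timeout usually points to a firewall or httpd problem, as discussed below.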
Depending on your firewall or your organization's firewall, you may need to change the port here and in the httpd configuration (by modifying the \"Listen\" directive in /etc/httpd/conf/httpd.conf ). Note that web servers are often an attacked piece of infrastructure, so you may want to go through the Apache configuration in /etc/httpd/conf/httpd.conf and disable unneeded modules. Troubleshooting GlideinWMS Factory File Locations File Description File Location Comment Configuration file /etc/gwms-factory/glideinWMS.xml Main configuration file Logs /var/log/gwms-factory/server/factory Overall server logs /var/log/gwms-factory/server/entry_NAME Specific entry logs (generally more useful) /var/log/gwms-factory/client Glidein Pilot logs separated by user and entry Startup script /etc/init.d/gwms-factory Web Directory /var/lib/gwms-factory/web-area Web Base /var/lib/gwms-factory/web-base Working Directory /var/lib/gwms-factory/work-dir/ Increase the log level and change rotation policies You can increase the log level of the Factory. To add a log file with all the log information, add the following line with all the message types in the process_log section of /etc/gwms-factory/glideinWMS.xml : You can also change the rotation policy and choose whether to compress the rotated files, all in the same section of the config files: max_bytes is the max size of the log files; after max_days they will be rotated. compression specifies if rotated files are compressed backup_count is the number of rotated log files kept Further details are in the reference documentation . Failed authentication errors If you get messages such as these in the logs, the Factory does not trust the frontend and will not submit glideins. WARNING: Client fermicloud128-fnal-gov_OSG_gWMSFrontend.main (secid: frontend_name) not in white list. Skipping request This error means that the frontend name in the security section of the Factory does not match the security_name in the frontend. Client fermicloud128-fnal-gov_OSG_gWMSFrontend.main (secid: frontend_name) is not coming from a trusted source; AuthenticatedIdentity vofrontend_condor@fermicloud130.fnal.gov!=vofrontend_factory@fermicloud130.fnal.gov. Skipping for security reasons. This error means that the identity in the security section of the Factory does not match what the /etc/condor/certs/condor_mapfile authenticates the Frontend to in HTCondor (AuthenticatedIdentity in the classad). Make sure the attributes are correctly lined up as in the Frontend security configuration section above. Glideins start but do not connect to User pool / VO Frontend Check the appropriate job err and out logs in /var/log/gwms-factory/client to see if any errors were reported. Often, this will be a pilot unable to access a web server or with an invalid proxy. Also, verify that the condor_mapfile is correct on the VO Frontend's user pool collector and configuration. Glideins start but fail before running job with error \"Proxy not long lived enough\" If the glideins are running on a resource (entry) but the jobs are not running and the log files in /var/log/gwms-factory/client/user_frontend/glidein_gfactory_instance/ENTRY_NAME report an error like \"Proxy not long lived enough (86096 s left), shortened retire time ...\", then probably the HTCondor RLM on the Compute Element is delegating the proxy and shortening its lifespan. This can be fixed by setting DELEGATE_JOB_GSI_CREDENTIALS = FALSE as suggested in the CE install document . 
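If you suspect this problem, a quick way to look for that message across all entries is to grep the glidein client logs (a sketch, using the log locations from the File Locations table above): root@host # grep -r 'Proxy not long lived enough' /var/log/gwms-factory/client/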
References http://glideinwms.fnal.gov/doc.prd/ https://opensciencegrid.org/docs/other/install-gwms-frontend/\",\"title\":\"Installing GlideinWMS Factory\"},{\"location\":\"services/install-gwms-factory/#glideinwms-factory-installation\",\"text\":\"This document describes how to install a Glidein Workflow Management System (GlideinWMS) Factory instance. This document assumes expertise with HTCondor and familiarity with the GlideinWMS software. It does not cover anything but the simplest possible install. Please consult the GlideinWMS reference documentation for advanced topics, including non-root, non-RPM-based installation. In this document the terms glidein and pilot (job) will be used interchangeably. This part covers these primary components of the GlideinWMS system: WMS Collector / Schedd : A set of condor_collector and condor_schedd processes that allow the submission of pilots to Grid entries. GlideinWMS Factory : The process that submits the pilots when needed Warning We strongly recommend that you use the OSG-provided Factory and do not install your own . A VO Frontend is sufficient to submit your jobs and to decide scheduling policies, and it will spare you the complexity of dealing directly with grid/cloud sites. If you really need your own Factory, be aware that it is a complex component and may require a non-trivial maintenance effort.\",\"title\":\"GlideinWMS Factory Installation\"},{\"location\":\"services/install-gwms-factory/#before-starting\",\"text\":\"Before starting the installation process, consider the following points (consulting the Reference section below as needed):\",\"title\":\"Before Starting\"},{\"location\":\"services/install-gwms-factory/#requirements\",\"text\":\"\",\"title\":\"Requirements\"},{\"location\":\"services/install-gwms-factory/#host-and-os\",\"text\":\"A host to install the GlideinWMS Factory (pristine node). Currently most of our testing has been done on Scientific Linux 6 and 7. Root access The GlideinWMS Factory has the following requirements: CPU : 4-8 cores for a large installation (1 should suffice on a small install) RAM : 4-8GB on a large installation (1GB should suffice for small installs) Disk : 10GB will be plenty for all the binaries, config and log files related to GlideinWMS. If you are a large site that needs to keep significant history and logs, you may want to allocate 100GB+ to store long histories.\",\"title\":\"Host and OS\"},{\"location\":\"services/install-gwms-factory/#users\",\"text\":\"The GlideinWMS Factory installation will create the following users unless they are already created . User Default uid Comment condor none HTCondor user (installed via dependencies). gfactory none This user runs the GlideinWMS VO factory. To verify that the user gfactory has gfactory as its primary group, check the output of root@host # getent passwd gfactory | cut -d: -f4 | xargs getent group It should be the gfactory group.\",\"title\":\"Users\"},{\"location\":\"services/install-gwms-factory/#certificates\",\"text\":\"Certificate User that owns certificate Path to certificate Host certificate root /etc/grid-security/hostcert.pem /etc/grid-security/hostkey.pem Here are instructions to request a host certificate. 
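Once the host certificate and key are in place, it is worth confirming their ownership, permissions, and expiration date before continuing; a minimal check with openssl, assuming the standard paths from the table above: root@host # ls -l /etc/grid-security/hostcert.pem /etc/grid-security/hostkey.pem root@host # openssl x509 -in /etc/grid-security/hostcert.pem -noout -subject -dates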
The host certificate/key is used for authorization, however, authorization between the Factory and the GlideinWMS collector is done by file system authentication.","title":"Certificates"},{"location":"services/install-gwms-factory/#networking","text":"","title":"Networking"},{"location":"services/install-gwms-factory/#firewalls","text":"It must be on the public internet, with at least one port open to the world; all worker nodes will load data from this node trough HTTP. Note that worker nodes will also need outbound access in order to access this HTTP port.","title":"Firewalls"},{"location":"services/install-gwms-factory/#installation-procedure","text":"As with all OSG software installations, there are some one-time (per host) steps to prepare in advance: Ensure the host has a supported operating system Obtain root access to the host Prepare the required Yum repositories Install CA certificates","title":"Installation Procedure"},{"location":"services/install-gwms-factory/#installing-htcondor","text":"Most required software is installed from the Factory RPM installation. HTCondor is the only exception since there are many different ways to install it , using the RPM system or not. You need to have HTCondor installed before installing the GlideinWMS Factory. If yum cannot find a HTCondor RPM, it will install the dummy empty-condor RPM, assuming that you installed HTCondor using a tarball distribution. If you don't have HTCondor already installed, you can install the HTCondor RPM from the OSG repository: root@host # yum install condor.x86_64","title":"Installing HTCondor"},{"location":"services/install-gwms-factory/#installing-htcondor-bosco","text":"If you plan to send jobs using direct batch submission (aka BOSCO), then you need also the condor-bosco package. You'll have to install the package and remove one of its files /etc/condor/config.d/60-campus_factory.config because it interferes with the Factory configuration. root@host # yum install condor-bosco root@host # rm /etc/condor/config.d/60-campus_factory.config root@host # touch /etc/condor/config.d/60-campus_factory.config","title":"Installing HTCondor-BOSCO"},{"location":"services/install-gwms-factory/#install-gwms-factory","text":"","title":"Install GWMS Factory"},{"location":"services/install-gwms-factory/#download-and-install-the-factory-rpm","text":"Install the RPM and dependencies (be prepared for a lot of dependencies). root@host # yum install glideinwms-factory This will install the current production release verified and tested by OSG with default HTCondor configuration. This command will install the GlideinWMS Factory, HTCondor, the OSG client, and all the required dependencies. If you wish to install a different version of GlideinWMS, add the \"--enablerepo\" argument to the command as follows: yum install --enablerepo=osg-testing glideinwms-factory : The most recent production release, still in testing phase. This will usually match the current tarball version on the GlideinWMS home page . (The osg-release production version may lag behind the tarball release by a few weeks as it is verified and packaged by OSG). Note that this will also take the osg-testing versions of all dependencies as well. yum install --enablerepo=osg-upcoming glideinwms-factory : The most recent development series release, ie version 3.3.x release. 
This has newer features such as cloud submission support, but is less tested.","title":"Download and install the Factory RPM"},{"location":"services/install-gwms-factory/#download-htcondor-tarballs","text":"You will need to download HTCondor tarballs for each architecture that you want to deploy pilots on . At this point, GlideinWMS factory does not support pulling HTCondor binaries from your system area. Suggested is that you put these binaries in /var/lib/gwms-factory/condor but any gfactory accessible location should suffice.","title":"Download HTCondor tarballs"},{"location":"services/install-gwms-factory/#configuration-procedure","text":"After installing the RPM you need to configure the components of the GlideinWMS Factory: Edit Factory configuration options Edit HTCondor configuration options Create a HTCondor grid map file Reconfigure and Start Factory","title":"Configuration Procedure"},{"location":"services/install-gwms-factory/#configuring-the-factory","text":"The configuration file is /etc/gwms-factory/glideinWMS.xml . The next steps will describe each line that you will need to edit for most cases, but you may want to review the whole file to be sure that it is configured correctly.","title":"Configuring the Factory"},{"location":"services/install-gwms-factory/#security-configuration","text":"In the security section, you will need to provide each Frontend that is allowed to communicate with the Factory: security key_length=\"2048\" pub_key=\"RSA\" remove_old_cred_age=\"30\" remove_old_cred_freq=\"24\" reuse_oldkey_onstartup_gracetime=\"900\"> These attributes are very important to get exactly right or the Frontend will not be trusted. This should match one of the factory and security sections of the Frontend configuration Configuring the GlideinWMS Frontend in the following way: Note This is a snippet from the Frontend configuration (for reference), not the Factory that you are configuring now! For the factory section: # from frontend.xml .... For the security: # from frontend.xml Note that the identity of the Frontend must match what HTCondor authenticates the DN of the frontend to. In /etc/condor/certs/condor_mapfile , there must be an entry with vofrontend_service definition (in this case): GSI \"^\\/DC\\=org\\/DC\\=doegrids\\/OU\\=Services\\/CN\\=Some\\ Name\\ 834323%ENDCOLOR%$\" % GREEN % vofrontend_service % ENDCOLOR %","title":"Security configuration"},{"location":"services/install-gwms-factory/#entry-configuration","text":"Entries are grid/cloud endpoints (aka Compute Elements, or gatekeepers) that can accept job requests and run pilots (which will run user jobs). Each entry needs to be configured to communicate to a specific gatekeeper. An example test entry is provided in the default GlideinWMS configuration file. At the very least, you will need to modify the entry line: You will need to modify the entry name and gatekeeper . This will determine the gatekeeper that you access. Specific gatekeepers often require specific \"rsl\" attributes that determine the job queue that you are submitting to, or other attributes. Add them in the rsl attribute. Also, be sure to distribute your entries across the various HTCondor schedd work managers to balance load. To see the available schedd use condor_status -schedd -l | grep Name . Several schedd options are configured by default for you: schedd_glideins2, schedd_glideins3, schedd_glideins4, schedd_glideins5 , as well as the default schedd . This can be modified in the HTCondor configuration. 
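As an alternative to the grep pipeline mentioned above, you can ask condor_status to print just the schedd names; a small sketch (assumes your HTCondor version supports the -af autoformat option, which HTCondor 8.x does): root@host # condor_status -schedd -af Name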
Add any specific options, such as limitations on jobs/pilots or glexec/voms requirements, in the entry section below the above line. More details are in the GlideinWMS Factory configuration guide . !!! warning If there is no match between the auth_method and trust_domain of the entry and the type and trust_domain listed in one of the credentials of one of the Frontends using this Factory, then no job can run on that entry. The Factory must advertise the correct Resource Name of each entry for accounting purposes. The Factory must also advertise in the entry all the attributes that will allow it to match the query expression used in the Frontends connecting to this Factory (e.g. as explained in the VO frontend configuration document ). Note Keep an eye on this part as we're dealing with singularity. You must also advertise correctly whether the site supports gLExec . If it does not, set GLEXEC_BIN to NONE ; if gLExec is installed via OSG, set it to OSG ; otherwise set it to the path of gLExec. For example, this snippet advertises the GLIDEIN_Supported_VOs attribute with the supported VOs, so that it can be used with the query above in the VO frontend, and says that the resource does not support gLExec: ... ... Note Especially if jobs are sent to OSG resources, it is very important to set the GLIDEIN_Resource_Name consistently with the Resource Name reported in OIM, because that name will be used for job accounting in Gratia. It should be the name of the Resource in OIM or the name of the Resource Group (especially if there are many gatekeepers submitting to the same cluster). More information on options can be found here\",\"title\":\"Entry configuration\"},{\"location\":\"services/install-gwms-factory/#configuring-tarballs\",\"text\":\"Each pilot will download HTCondor binaries from the staging area. Often, multiple binaries are needed to support various architectures and platforms. Currently, you will need to provide at least one tarball for GlideinWMS to use. (Using the system binaries is currently not supported.) Download a HTCondor tarball from here . It is suggested to put the binaries in /var/lib/gwms-factory/condor , but any factory-accessible location will do just fine. Once you have downloaded the tarball, configure it in /etc/gwms-factory/glideinWMS.xml like in the following: Remember also to modify the condor_os and condor_arch attributes in the entries (the configured Compute Elements) to pick the correct HTCondor binary. Here are more details on using multiple HTCondor binaries. Note that it is sufficient to set the base_dir ; the reconfigure command will prepare the tarball and add it to the XML config file.\",\"title\":\"Configuring Tarballs\"},{\"location\":\"services/install-gwms-factory/#configuring-htcondor\",\"text\":\"The HTCondor configuration for the Factory is placed in /etc/condor/config.d . 00_gwms_factory_general.config 00-restart_peaceful.config 01_gwms_factory_collectors.config 02_gwms_factory_schedds.config 03_gwms_local.config 10-batch_gahp_blahp.config Get rid of the pre-loaded HTCondor default configuration: root@host # rm /etc/condor/config.d/00personal_condor.config root@host # touch /etc/condor/config.d/00personal_condor.config For most installations, the items you need to modify are in 03_gwms_local.config . The lines you will have to edit are: Credentials of the machine. You can either run using a proxy, or a service certificate. It is recommended to use a host certificate and specify its location in the variables GSI_DAEMON_CERT and GSI_DAEMON_KEY . 
The host certificate should be owned by root and have the correct permissions, 600. HTCondor ids in the form UID.GID (both are integers) HTCondor admin email. Will receive messages when services fail. #-- HTCondor user: condor CONDOR_IDS = #-- Contact (via email) when problems occur CONDOR_ADMIN = ############################ # GSI Security config ############################ #-- Grid Certificate directory GSI_DAEMON_TRUSTED_CA_DIR= /etc/grid-security/certificates #-- Credentials GSI_DAEMON_CERT = /etc/grid-security/hostcert.pem GSI_DAEMON_KEY = /etc/grid-security/hostkey.pem #-- HTCondor mapfile CERTIFICATE_MAPFILE= /etc/condor/certs/condor_mapfile ################################### # Whitelist of HTCondor daemon DNs ################################### #DAEMON_LIST = COLLECTOR, MASTER, NEGOTIATOR, SCHEDD, STARTD","title":"Configuring HTCondor"},{"location":"services/install-gwms-factory/#using-other-htcondor-rpms-eg-uw-madison-htcondor-rpm","text":"The above procedure will work if you are using the OSG HTCondor RPMS. You can verify that you used the OSG HTCondor RPM by using yum list condor . The version name should include \"osg\", e.g. 8.6.9-1.1.osg34.el7 . If you are using the UW Madison HTCondor RPMS, be aware of the following changes: This HTCondor RPM uses a file /etc/condor/condor_config.local to add your local machine slot to the user pool. If you want to disable this behavior (recommended), you should blank out that file or comment out the line in /etc/condor/condor_config for LOCAL_CONFIG_FILE. (Make sure that LOCAL_CONFIG_DIR is set to /etc/condor/config.d ) Note that the variable LOCAL_DIR is set differently in UW Madison and OSG RPMs. This should not cause any more problems in the Glideinwms RPMs, but please take note if you use this variable in your job submissions or other customizations. In general if you are using a non OSG RPM or if you added custom configuration files for HTCondor please check the order of the configuration files: root@host # condor_config_val -config Configuration source: /etc/condor/condor_config Local configuration sources: /etc/condor/config.d/00-restart_peaceful.config /etc/condor/config.d/00_gwms_factory_general.config /etc/condor/config.d/01_gwms_factory_collectors.config /etc/condor/config.d/02_gwms_factory_schedds.config /etc/condor/config.d/03_gwms_local.config /etc/condor/config.d/10-batch_gahp_blahp.config /etc/condor/condor_config.local","title":"Using other HTCondor RPMs, e.g. UW Madison HTCondor RPM"},{"location":"services/install-gwms-factory/#restarting-htcondor","text":"After configuring HTCondor, be sure to restart HTCondor: root@host # service condor restart","title":"Restarting HTCondor"},{"location":"services/install-gwms-factory/#create-a-htcondor-grid-mapfile","text":"The HTCondor grid mapfile /etc/condor/certs/condor_mapfile is used for authentication between the glidein running on a remote worker node, and the local collector. HTCondor uses the mapfile to map certificates to pseudo-users on the local machine. It is important that you map the DN's of each frontend you are talking to. 
Below is an example mapfile, by default found in /etc/condor/certs/condor_mapfile : GSI \"^\\/DC\\=org\\/DC\\=doegrids\\/OU\\=People\\/CN\\=Some\\ Name\\ 123456$\" frontend GSI (.*) anonymous FS (.*) \\1 Each frontend needs a line that maps to the user specified in the identity argument in the frontend security section of the Factory configuration.","title":"Create a HTCondor grid mapfile."},{"location":"services/install-gwms-factory/#reconfiguring-glideinwms","text":"After changing the configuration of GlideinWMS and making sure that Factory is running, use the following table to find the appropriate command for your operating system (run as root ): If your operating system is... Run the following command... Enterprise Linux 7 systemctl reload gwms-factory Enterprise Linux 6 service gwms-factory reconfig Note Notice that, in the case of Enterprise Linux 7 systemctl reload gwms-factory will work only if: - gwms-factory service is running - gwms-factory service was started with systemctl Otherwise, you will get the following error in any of the cases: # systemctl reload gwms-factory Job for gwms-factory.service invalid.","title":"Reconfiguring GlideinWMS"},{"location":"services/install-gwms-factory/#upgrading-glideinwms","text":"Before you start the Factory service for the first time or after an update of the RPM or after you change GlideinWMS scripts, you should always use the GlideinWMS \"upgrade\" command. To do so: Make sure the condor and gwms-factory services are stopped (in EL6 this will be done for you). Issue the upgrade command: If you are using Enterprise Linux 7: root@host # /usr/sbin/gwms-factory upgrade If you are using Enterprise Linux 6: root@host # service gwms-factory upgrade Start the condor and gwms-factory services (see next part).","title":"Upgrading GlideinWMS"},{"location":"services/install-gwms-factory/#service-activation-and-deactivation","text":"To start the Factory you must start also HTCondor and the Web server beside the Factory itself: # %RED%For RHEL 6 , CentOS 6 , and SL6%ENDCOLOR% root@host # service condor start root@host # service httpd start root@host # service gwms-factory start # %RED% For RHEL 7 , CentOS 7 , and SL7%ENDCOLOR% root@host # systemctl start condor root@host # systemctl start httpd root@host # systemctl start gwms-factory Note Once you successfully start using the Factory service, anytime you change the /etc/gwms-factory/glideinWMS.xml file you will need to run a reconfig/reload command. If you change also some code you need the upgrade command mentioned above: # %RED% For RHEL 6 , CentOS 6 , and SL6%ENDCOLOR% root@host # service gwms-factory reconfig # %RED% But the situation is a bit more complicated in RHEL 7 , CentOS 7 , and SL7 due to systemd restrictions%ENDCOLOR% # %GREEN% For reconfig:%ENDCOLOR% A. %RED% when the Factory is running%ENDCOLOR% A.1 %RED% without any additional options%ENDCOLOR% root@host # /usr/sbin/gwms-factory reconfig%ENDCOLOR% or root@host # systemctl reload gwms-factory A.2 %RED% if you want to give additional options %ENDCOLOR% systemctl stop gwms-factory /usr/sbin/gwms-factory reconfig \"and your options\" systemctl start gwms-factory B. 
%RED% when the Factory is NOT running %ENDCOLOR% root@host # /usr/sbin/gwms-factory reconfig ( \"and your options\" ) To enable the services so that they restart after a reboot: # %RED%# For RHEL 6 , CentOS 6 , and SL6%ENDCOLOR% root@host # /sbin/chkconfig fetch-crl-cron on root@host # /sbin/chkconfig fetch-crl-boot on root@host # /sbin/chkconfig condor on root@host # /sbin/chkconfig httpd on root@host # /sbin/chkconfig gwms-factory on # %RED%# For RHEL 7 , CentOS 7 , and SL7%ENDCOLOR% root@host # systemctl enable fetch-crl-cron root@host # systemctl enable fetch-crl-boot root@host # systemctl enable condor root@host # systemctl enable httpd root@host # systemctl enable gwms-factory To stop the Factory: # %RED%For RHEL 6 , CentOS 6 , and SL6 %ENDCOLOR% root@host # service gwms-factory stop # %RED%For RHEL 7 , CentOS 7 , and SL7%ENDCOLOR% root@host # systemctl stop gwms-factory And you can stop also the other services if you are not using them independently of the Factory.","title":"Service Activation and Deactivation"},{"location":"services/install-gwms-factory/#validating-glideinwms-factory","text":"The complete validation of the Factory is the submission of actual jobs. You can also check that the services are up and running: root@host # condor_status -any MyType TargetType Name glidefactoryclient None 12345_TEST_ENTRY@gfactory_instance@ glideclient None 12345_TEST_ENTRY@gfactory_instance@ glidefactory None TEST_ENTRY@gfactory_instance@ glidefactoryglobal None gfactory_instance@gfactory_ser glideclientglobal None gfactory_instance@gfactory_ser Scheduler None hostname.fnal.gov DaemonMaster None hostname.fnal.gov Negotiator None hostname.fnal.gov Scheduler None schedd_glideins2@hostname Scheduler None schedd_glideins3@hostname Scheduler None schedd_glideins4@hostname Scheduler None schedd_glideins5@hostname Collector None wmscollector_service@hostname You should have one \"glidefactory\" classAd for each entry that you have enabled. If you have already configured the frontends, you will also have one glidefactoryclient and one glideclient classAd for each frontend / entry. You can check also the monitoring Web page: http://YOUR_HOST_FQDN/factory/monitor/ You can also test the local submission of a job to a resource using the test script local_start.sh but you must first install the OSG client tools and generate a proxy. After that you can run the test (replace ENTRY_NAME with the name of one of the entries in /etc/gwms-factory/glideinWMS.xml ):","title":"Validating GlideinWMS Factory"},{"location":"services/install-gwms-factory/#check-web-server-configuration-for-the-monitoring","text":"Verify path and specially the URL for the GlideinWMS files served by your web server: stage base_dir = \"/var/lib/gwms-factory/web-area/stage\" use_symlink = \"True\" web_base_url = \"http://HOSTNAME:PORT/factory/stage\" This will determine the location of your web server . Make sure that the URL is visible. Depending on your firewall or the one of your organization, you may need to change the port here and in the httpd configuration (by modifying the \"Listen\" directive in /etc/httpd/conf/httpd.conf ). 
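For example, to confirm which port httpd is configured to listen on and that it is actually bound, you could run something like the following (a sketch; ss is provided by the iproute package on EL7): root@host # grep -E '^Listen' /etc/httpd/conf/httpd.conf root@host # ss -tlnp | grep httpd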
Note that web servers are often an attacked piece of infrastructure, so you may want to go through the Apache configuration in /etc/httpd/conf/httpd.conf and disable unneeded modules.\",\"title\":\"Check Web server configuration for the monitoring\"},{\"location\":\"services/install-gwms-factory/#troubleshooting-glideinwms-factory\",\"text\":\"\",\"title\":\"Troubleshooting GlideinWMS Factory\"},{\"location\":\"services/install-gwms-factory/#file-locations\",\"text\":\"File Description File Location Comment Configuration file /etc/gwms-factory/glideinWMS.xml Main configuration file Logs /var/log/gwms-factory/server/factory Overall server logs /var/log/gwms-factory/server/entry_NAME Specific entry logs (generally more useful) /var/log/gwms-factory/client Glidein Pilot logs separated by user and entry Startup script /etc/init.d/gwms-factory Web Directory /var/lib/gwms-factory/web-area Web Base /var/lib/gwms-factory/web-base Working Directory /var/lib/gwms-factory/work-dir/\",\"title\":\"File Locations\"},{\"location\":\"services/install-gwms-factory/#increase-the-log-level-and-change-rotation-policies\",\"text\":\"You can increase the log level of the Factory. To add a log file with all the log information, add the following line with all the message types in the process_log section of /etc/gwms-factory/glideinWMS.xml : You can also change the rotation policy and choose whether to compress the rotated files, all in the same section of the config files: max_bytes is the max size of the log files; after max_days they will be rotated. compression specifies if rotated files are compressed backup_count is the number of rotated log files kept Further details are in the reference documentation .\",\"title\":\"Increase the log level and change rotation policies\"},{\"location\":\"services/install-gwms-factory/#failed-authentication-errors\",\"text\":\"If you get messages such as these in the logs, the Factory does not trust the frontend and will not submit glideins. WARNING: Client fermicloud128-fnal-gov_OSG_gWMSFrontend.main (secid: frontend_name) not in white list. Skipping request This error means that the frontend name in the security section of the Factory does not match the security_name in the frontend. Client fermicloud128-fnal-gov_OSG_gWMSFrontend.main (secid: frontend_name) is not coming from a trusted source; AuthenticatedIdentity vofrontend_condor@fermicloud130.fnal.gov!=vofrontend_factory@fermicloud130.fnal.gov. Skipping for security reasons. This error means that the identity in the security section of the Factory does not match what the /etc/condor/certs/condor_mapfile authenticates the Frontend to in HTCondor (AuthenticatedIdentity in the classad). Make sure the attributes are correctly lined up as in the Frontend security configuration section above.\",\"title\":\"Failed authentication errors\"},{\"location\":\"services/install-gwms-factory/#glideins-start-but-do-not-connect-to-user-pool-vo-frontend\",\"text\":\"Check the appropriate job err and out logs in /var/log/gwms-factory/client to see if any errors were reported. Often, this will be a pilot unable to access a web server or with an invalid proxy. 
Also, verify that the condor_mapfile is correct on the VO Frontend's user pool collector and configuration.","title":"Glideins start but do not connect to User pool / VO Frontend"},{"location":"services/install-gwms-factory/#glideins-start-but-fail-before-running-job-with-error-proxy-not-long-lived-enough","text":"If the glideins are running on a resource (entry) but the jobs are not running and the log files in /var/log/gwms-factory/client/user_frontend/glidein_gfactory_instance/ENTRY_NAME report an error like \"Proxy not long lived enough (86096 s left), shortened retire time ...\", then probably the HTCondor RLM on the Compute Element is delegating the proxy and shortening its lifespan. This can be fixed by setting DELEGATE_JOB_GSI_CREDENTIALS = FALSE as suggested in the CE install document .","title":"Glideins start but fail before running job with error \"Proxy not long lived enough\""},{"location":"services/install-gwms-factory/#references","text":"http://glideinwms.fnal.gov/doc.prd/ https://opensciencegrid.org/docs/other/install-gwms-frontend/","title":"References"},{"location":"services/sending-announcements/","text":"Sending Announcements Various OSG teams need to send out announcement about various events (releases, security advisories, planned changes, etc). This page describes how to send announcements using the osg-notify tool. Prerequisites To send announcements, the following conditions must be met: A host with an IP address listed in the SPF Record A sufficiently modern Linux operating system. This procedure has been tested on a FermiCloud Scientific Linux 7 VM and a Linux Mint 18.3 laptop. It is known not to work on a FermiCloud Scientific Linux 6 VM. A valid OSG user certificate to lookup contacts in the topology database Local hostname matches DNS DNS forward and reverse lookups in place [tim@submit-1 topology]$ hostname submit-1.chtc.wisc.edu [tim@submit-1 topology]$ host submit-1.chtc.wisc.edu submit-1.chtc.wisc.edu has address 128.105.244.191 [tim@submit-1 topology]$ host 128 .105.244.191 191.244.105.128.in-addr.arpa domain name pointer submit-1.chtc.wisc.edu. (Required for security announcements) A GPG Key to sign the announcement Installation Install the required Yum repositories : Install the OSG tools: # yum install --enablerepo = devops topology-client If you are on a FermiCloud VM, update postfix to relay through FermiLab's official mail server: echo \"transport_maps = hash:/etc/postfix/transport\" >> /etc/postfix/main.cf echo \"* smtp:smtp.fnal.gov\" >> /etc/postfix/transport postmap hash:/etc/postfix/transport postfix reload Test this setup by sending a message to yourself only. Bonus points for using an email address that goes to a site with aggressive SPAM filtering. 
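For example, a test message can be pushed through the local postfix relay with the sendmail binary that postfix provides (a sketch; replace you@example.net with your own address): printf 'Subject: osg-notify relay test\n\nThis is a test.\n' | sendmail -v you@example.net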
Sending the announcement Use the osg-notify tool to send the announcement using the relevant options from the following table: Option Description --dry-run Use this option until you are ready to actually send the message --cert File that contains your OSG User Certificate --key File that contains your Private Key for your OSG User Certificate --no-sign Don't GPG sign the message (release only) --type production Not a test message --message File containing your message --subject The subject of your message --recipients List of recipient email addresses, must have at least one --oim-recipients Select contacts associated with resources and/or VOs --oim-contact-type Replacing with administrative for release announcements or security for security announcements --bypass-dns-check Use this option to skip the check that one of the host's IP addresses matches the hostname resolution Security requirements Security announcements must be signed using the following options: --sign : GPG sign the message --sign-id : The ID of the key used for signing --from security : The mail comes from the OSG Security Team For release announcements use the following command: osg-notify --cert your-cert.pem --key your-key.pem \\ --no-sign --type production --message \\ --subject '' \\ --recipients \"osg-general@opensciencegrid.org osg-operations@opensciencegrid.org osg-sites@opensciencegrid.org vdt-discuss@opensciencegrid.org\" \\ --oim-recipients resources --oim-recipients vos --oim-contact-type administrative Replacing with an appropriate subject for your announcement and with the path to the file containing your message in plain text.\",\"title\":\"Sending Announcements\"},{\"location\":\"services/sending-announcements/#sending-announcements\",\"text\":\"Various OSG teams need to send out announcements about various events (releases, security advisories, planned changes, etc). This page describes how to send announcements using the osg-notify tool.\",\"title\":\"Sending Announcements\"},{\"location\":\"services/sending-announcements/#prerequisites\",\"text\":\"To send announcements, the following conditions must be met: A host with an IP address listed in the SPF Record A sufficiently modern Linux operating system. This procedure has been tested on a FermiCloud Scientific Linux 7 VM and a Linux Mint 18.3 laptop. It is known not to work on a FermiCloud Scientific Linux 6 VM. A valid OSG user certificate to look up contacts in the topology database Local hostname matches DNS DNS forward and reverse lookups in place [tim@submit-1 topology]$ hostname submit-1.chtc.wisc.edu [tim@submit-1 topology]$ host submit-1.chtc.wisc.edu submit-1.chtc.wisc.edu has address 128.105.244.191 [tim@submit-1 topology]$ host 128.105.244.191 191.244.105.128.in-addr.arpa domain name pointer submit-1.chtc.wisc.edu. (Required for security announcements) A GPG Key to sign the announcement\",\"title\":\"Prerequisites\"},{\"location\":\"services/sending-announcements/#installation\",\"text\":\"Install the required Yum repositories : Install the OSG tools: # yum install --enablerepo = devops topology-client If you are on a FermiCloud VM, update postfix to relay through FermiLab's official mail server: echo \"transport_maps = hash:/etc/postfix/transport\" >> /etc/postfix/main.cf echo \"* smtp:smtp.fnal.gov\" >> /etc/postfix/transport postmap hash:/etc/postfix/transport postfix reload Test this setup by sending a message to yourself only. 
Bonus points for using an email address that goes to a site with aggressive SPAM filtering.","title":"Installation"},{"location":"services/sending-announcements/#sending-the-announcement","text":"Use the osg-notify tool to send the announcement using the relevant options from the following table: Option Description --dry-run Use this option until you are ready to actually send the message --cert File that contains your OSG User Certificate --key File that contains your Private Key for your OSG User Certificate --no-sign Don't GPG sign the message (release only) --type production Not a test message --message File containing your message --subject The subject of your message --recipients List of recipient email addresses, must have at least one --oim-recipients Select contacts associated with resources and/or VOs --oim-contact-type Replacing with administrative for release announcements or security for security announcements --bypass-dns-check Use this option to skip the check that one of the host's IP addresses matches with the hostname resolution Security requirements Security announcements must be signed using the following options: --sign : GPG sign the message --sign-id : The ID of the key used for singing --from security : The mail comes from the OSG Security Team For release announcements use the following command: osg-notify --cert your-cert.pem --key your-key.pem \\ --no-sign --type production --message \\ --subject '' \\ --recipients \"osg-general@opensciencegrid.org osg-operations@opensciencegrid.org osg-sites@opensciencegrid.org vdt-discuss@opensciencegrid.org\" \\ --oim-recipients resources --oim-recipients vos --oim-contact-type administrative Replacing with an appropriate subject for your announcement and with the path to the file containing your message in plain text.","title":"Sending the announcement"},{"location":"services/topology-contacts-data/","text":"Topology and Contacts Data This is internal documentation intended for OSG Operations staff. It contains information about the data provided by https://topology.opensciencegrid.org . The topology data for the service is in https://github.com/opensciencegrid/topology , in the projects/ , topology/ , and virtual-organizations/ subdirectories. The contacts data is in https://bitbucket.org/opensciencegrid/contact/ , in contacts.yaml . Topology Data Admins may request changes to data in the topology repo via either a GitHub pull request or a Freshdesk ticket. These changes can be to a project, a VO, or a resource. The registration document and topology README document should tell them how to do that. In the case of a GitHub pull request, you will need to provide IDs using the bin/next_ids tool in an up-to-date local clone of Topology and potentially fix-up other data. To assist the user, do one of the following, depending on the severity of the fixes required for the PR: For minor issues, submit a \"Comment\" review using GitHub suggestions and ask the user to incorporate your suggestions . For major issues, create a branch based off of their PR, make changes, and submit your own PR that closes the original user's PR. The CI checks should catch most errors but you should still review the YAML changes. Certain things to check are: Do contact names and IDs match what's in the contacts data? (See below for instructions on how to get that information.) If the person is not in the contacts data, you will need to add them before approving the PR. Is the PR submitter authorized to make changes to that project/VO/resource? 
Can you match them to a person affiliated with that project/VO/site? (The contacts data now includes the GitHub usernames for some people. See below for instructions on how to get that information.) Is their GitHub ID registered in the contact database and are they associated with the relevant resource, site, facility, or VO? Retiring resources A resource can be disabled in its topology yaml file by setting Active: false . However the resource entry should not be immediately deleted from the yaml file. One reason for this is that the WLCG accounting info configured for resources is used to determine which resources to send APEL numbers for. Removing resources prematurely could prevent resummarized GRACC data from getting sent appropriately. Resources that have been inactive for at least two years are eligible to be deleted from the topology database. The GRACC records for this resource can be inspected in Kibana . In the search bar, enter ProbeName:*\\:FQDN in the search bar, where FQDN is the FQDN defined for your resource For example, if your resource FQDN is cmsgrid01.hep.wisc.edu you would enter ProbeName:*\\:cmsgrid01.hep.wisc.edu In the upper-right corner, use the Time Range selection to pick \"Last 2 years\" With this criteria selected, Kibana will show you if it has received any records for this resource in the past two years. If there are no records returned, you may remove the resource from the resource group yaml file in the topology repo. Any downtime entries for this resource in the corresponding downtime yaml file for the resource group must be removed also. If you remove the last resource in the resource group yaml file, you should remove the resource group and corresponding downtime yaml files as well. Reviewing project PRs New projects are typically created by the Research Facilitation team. Here are a few things to check: Did osg-bot warn about a \"New Organization\"? If so, search around in the projects directory and make sure the \"Organization\" in the YAML is not a typo or alternate spelling for an existing organization. grep around in the /projects/ directory for substrings of the organization. For example, if the new org is \"University of Wisconsin Madison\", do: $ grep -i wisconsin projects/*.yaml and you will see that it's supposed to be \"University of Wisconsin-Madison\". If the new organization is not a typo or alternate spelling, dismiss osg-bot's review with the comment \"new org is legit\". - If osg-bot included a message about \"Unrecognized InstitutionID\" alongside this warning, check that the \"InstitutionID\" field in the project contains an ID that's found in the OSG institutions database . The topology project web-form should automatically populate the InstitutionID field for known institutions, but may fail in the case of spelling discrepancies. - If the organization is absent from the institutions database, add a comment to the pull request mentioning @opensciencegrid/project-office to request permission to add the institution. Once you have obtained permission, search for the institution's canonical name in the Research Organization Registry before adding it to the database. - Is the project name is of the form _ , e.g. UWMadison_Parks ? (This is recommended but not required for new projects.) If so: Is the short name -> organization mapping for the institution in /mappings/project_institution.yaml (e.g. UWMadison: \"University of Wisconsin-Madison\" )? If not, ask the PR author to add it. 
Does the \"FieldOfScience\" in the YAML match one of the keys in /mappings/nsfscience.yaml ? (The list is also available on the left column of this CSV .) Is the \"Sponsor\" correct? The sponsor depends on where the users will be submitting jobs from: If they primarily submit from some CI Connect interface such as \"OSG Connect\", use: Sponsor : CampusGrid : Name : The campus grid name must be one of the ones in the /projects/_CAMPUS_GRIDS.yaml file.. Otherwise, the project must be sponsored by a VO: Sponsor : VirtualOrganization : Name : The VO name must be one of the ones in the /virtual-organizations/ dir. Contacts Data The OSG keeps contact data for administrators and maintainers of OSG resources and VOs for the purpose of distributing security, software, and adminstrative (e.g., OSG All-Hands dates) announcements. Additionally, OSG contacts have the following abilities: View other contacts' information (via HTML and XML ) with a registered certificate Register resource downtimes for resources that they are listed as an administrative contact, if they have a registered GitHub ID Contact data is kept as editable YAML in https://bitbucket.org/opensciencegrid/contact/ , in contacts.yaml . The YAML file contains sensitive information and is only visible to people with access to that repo. Getting access to the contact repo The contacts repo is hosted on BitBucket. You will need an Atlassian account for access to BitBucket. The account you use for OSG JIRA should work. Once you have an account, request access from Brian Lin, Mat Selmeci, or Derek Weitzel. You should then be able to go to https://bitbucket.org/opensciencegrid/contact/ . Using the contact repo BitBucket is similar to GitHub except you don't make a fork of the contact repo, you just clone it to your local machine. This means that any pushes go directly to the main repo instead of your own fork. Danger Don't push to master. For any changes, always create your own branch, push your changes to that branch, then make a pull request. Have someone else review and merge your pull request. All contact data is stored in contacts.yaml . The contact info is keyed by a 40-character hexadecimal ID which was generated from their email address when they were first added. An example entry is: 25357f62c7ab2ae11ddda1efd272bb5435dbfacb : # ^ this is their ID FullName : Example A. User Profile : This is an example user. GitHub : ExampleUser # ContactInformation data requires authorization to view ContactInformation : DNs : - ... IM : ... PrimaryEmail : user@example.net PrimaryPhone : ... When making changes to the contact data, first see if a contact is already in the YAML file. Search the YAML file for their name. Be sure to try variations of their name if you don't find them -- someone may be listed as \"Dave\" or \"David\", or have a middle name or middle initial. Follow the instructions below for adding or updating a contact, as appropriate. Adding a new contact Danger Any new contacts need to have their association with the OSG verified by a known contact within the relevant VO, site, or project. When registering a new contact, first obtain the required contact information . After obtaining this information and verifying their association with the OSG, fill out the values in template-contacts.yaml and add it to contacts.yaml . To get the hash used as the ID, run email-hash on their email address. 
For example: $ cd contact # this is your local clone of the \"contact\" repo $ bin/email-hash user@example.net 25357f62c7ab2ae11ddda1efd272bb5435dbfacb Then your new entry will look like 25357f62c7ab2ae11ddda1efd272bb5435dbfacb : FullName : Example A. User .... The FullName and Profile fields in the main section, and the PrimaryEmail field in the ContactInformation section are required. The PrimaryEmail field in the ContactInformation section should match the hash that you used for the ID. In addition, if they will be making pull requests against the topology repo, e.g. for updating site information, reporting downtime, or updating project or VO information, obtain their GitHub username and put it in the GitHub field. Editing a contact Once you have found a contact in the YAML file, edit the attributes by hand. If you want to add information that is not present for that contact, look at template-contacts.yaml to find out what the attributes are called. Note The ID of the contact never changes, even if the user's PrimaryEmail changes. Important If you change the contact's FullName , you must make the same change to every place that the contact is mentioned in the topology repo. Get the contact changes merged in first.","title":"Topology and Contacts Data"},{"location":"services/topology-contacts-data/#topology-and-contacts-data","text":"This is internal documentation intended for OSG Operations staff. It contains information about the data provided by https://topology.opensciencegrid.org . The topology data for the service is in https://github.com/opensciencegrid/topology , in the projects/ , topology/ , and virtual-organizations/ subdirectories. The contacts data is in https://bitbucket.org/opensciencegrid/contact/ , in contacts.yaml .","title":"Topology and Contacts Data"},{"location":"services/topology-contacts-data/#topology-data","text":"Admins may request changes to data in the topology repo via either a GitHub pull request or a Freshdesk ticket. These changes can be to a project, a VO, or a resource. The registration document and topology README document should tell them how to do that. In the case of a GitHub pull request, you will need to provide IDs using the bin/next_ids tool in an up-to-date local clone of Topology and potentially fix-up other data. To assist the user, do one of the following, depending on the severity of the fixes required for the PR: For minor issues, submit a \"Comment\" review using GitHub suggestions and ask the user to incorporate your suggestions . For major issues, create a branch based off of their PR, make changes, and submit your own PR that closes the original user's PR. The CI checks should catch most errors but you should still review the YAML changes. Certain things to check are: Do contact names and IDs match what's in the contacts data? (See below for instructions on how to get that information.) If the person is not in the contacts data, you will need to add them before approving the PR. Is the PR submitter authorized to make changes to that project/VO/resource? Can you match them to a person affiliated with that project/VO/site? (The contacts data now includes the GitHub usernames for some people. See below for instructions on how to get that information.) 
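To find every place a contact is mentioned, a recursive grep over an up-to-date topology clone is usually enough; a sketch run from the root of the clone, using the example name from above: $ grep -rl 'Example A. User' projects/ topology/ virtual-organizations/ Update each match in a topology pull request after the contact change has been merged.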
Is their GitHub ID registered in the contact database and are they associated with the relevant resource, site, facility, or VO?","title":"Topology Data"},{"location":"services/topology-contacts-data/#retiring-resources","text":"A resource can be disabled in its topology yaml file by setting Active: false . However the resource entry should not be immediately deleted from the yaml file. One reason for this is that the WLCG accounting info configured for resources is used to determine which resources to send APEL numbers for. Removing resources prematurely could prevent resummarized GRACC data from getting sent appropriately. Resources that have been inactive for at least two years are eligible to be deleted from the topology database. The GRACC records for this resource can be inspected in Kibana . In the search bar, enter ProbeName:*\\:FQDN in the search bar, where FQDN is the FQDN defined for your resource For example, if your resource FQDN is cmsgrid01.hep.wisc.edu you would enter ProbeName:*\\:cmsgrid01.hep.wisc.edu In the upper-right corner, use the Time Range selection to pick \"Last 2 years\" With this criteria selected, Kibana will show you if it has received any records for this resource in the past two years. If there are no records returned, you may remove the resource from the resource group yaml file in the topology repo. Any downtime entries for this resource in the corresponding downtime yaml file for the resource group must be removed also. If you remove the last resource in the resource group yaml file, you should remove the resource group and corresponding downtime yaml files as well.","title":"Retiring resources"},{"location":"services/topology-contacts-data/#reviewing-project-prs","text":"New projects are typically created by the Research Facilitation team. Here are a few things to check: Did osg-bot warn about a \"New Organization\"? If so, search around in the projects directory and make sure the \"Organization\" in the YAML is not a typo or alternate spelling for an existing organization. grep around in the /projects/ directory for substrings of the organization. For example, if the new org is \"University of Wisconsin Madison\", do: $ grep -i wisconsin projects/*.yaml and you will see that it's supposed to be \"University of Wisconsin-Madison\". If the new organization is not a typo or alternate spelling, dismiss osg-bot's review with the comment \"new org is legit\". - If osg-bot included a message about \"Unrecognized InstitutionID\" alongside this warning, check that the \"InstitutionID\" field in the project contains an ID that's found in the OSG institutions database . The topology project web-form should automatically populate the InstitutionID field for known institutions, but may fail in the case of spelling discrepancies. - If the organization is absent from the institutions database, add a comment to the pull request mentioning @opensciencegrid/project-office to request permission to add the institution. Once you have obtained permission, search for the institution's canonical name in the Research Organization Registry before adding it to the database. - Is the project name is of the form _ , e.g. UWMadison_Parks ? (This is recommended but not required for new projects.) If so: Is the short name -> organization mapping for the institution in /mappings/project_institution.yaml (e.g. UWMadison: \"University of Wisconsin-Madison\" )? If not, ask the PR author to add it. 
Does the \"FieldOfScience\" in the YAML match one of the keys in /mappings/nsfscience.yaml ? (The list is also available on the left column of this CSV .) Is the \"Sponsor\" correct? The sponsor depends on where the users will be submitting jobs from: If they primarily submit from some CI Connect interface such as \"OSG Connect\", use: Sponsor : CampusGrid : Name : The campus grid name must be one of the ones in the /projects/_CAMPUS_GRIDS.yaml file.. Otherwise, the project must be sponsored by a VO: Sponsor : VirtualOrganization : Name : The VO name must be one of the ones in the /virtual-organizations/ dir.","title":"Reviewing project PRs"},{"location":"services/topology-contacts-data/#contacts-data","text":"The OSG keeps contact data for administrators and maintainers of OSG resources and VOs for the purpose of distributing security, software, and adminstrative (e.g., OSG All-Hands dates) announcements. Additionally, OSG contacts have the following abilities: View other contacts' information (via HTML and XML ) with a registered certificate Register resource downtimes for resources that they are listed as an administrative contact, if they have a registered GitHub ID Contact data is kept as editable YAML in https://bitbucket.org/opensciencegrid/contact/ , in contacts.yaml . The YAML file contains sensitive information and is only visible to people with access to that repo.","title":"Contacts Data"},{"location":"services/topology-contacts-data/#getting-access-to-the-contact-repo","text":"The contacts repo is hosted on BitBucket. You will need an Atlassian account for access to BitBucket. The account you use for OSG JIRA should work. Once you have an account, request access from Brian Lin, Mat Selmeci, or Derek Weitzel. You should then be able to go to https://bitbucket.org/opensciencegrid/contact/ .","title":"Getting access to the contact repo"},{"location":"services/topology-contacts-data/#using-the-contact-repo","text":"BitBucket is similar to GitHub except you don't make a fork of the contact repo, you just clone it to your local machine. This means that any pushes go directly to the main repo instead of your own fork. Danger Don't push to master. For any changes, always create your own branch, push your changes to that branch, then make a pull request. Have someone else review and merge your pull request. All contact data is stored in contacts.yaml . The contact info is keyed by a 40-character hexadecimal ID which was generated from their email address when they were first added. An example entry is: 25357f62c7ab2ae11ddda1efd272bb5435dbfacb : # ^ this is their ID FullName : Example A. User Profile : This is an example user. GitHub : ExampleUser # ContactInformation data requires authorization to view ContactInformation : DNs : - ... IM : ... PrimaryEmail : user@example.net PrimaryPhone : ... When making changes to the contact data, first see if a contact is already in the YAML file. Search the YAML file for their name. Be sure to try variations of their name if you don't find them -- someone may be listed as \"Dave\" or \"David\", or have a middle name or middle initial. Follow the instructions below for adding or updating a contact, as appropriate.","title":"Using the contact repo"},{"location":"services/topology-contacts-data/#adding-a-new-contact","text":"Danger Any new contacts need to have their association with the OSG verified by a known contact within the relevant VO, site, or project. When registering a new contact, first obtain the required contact information . 
After obtaining this information and verifying their association with the OSG, fill out the values in template-contacts.yaml and add it to contacts.yaml . To get the hash used as the ID, run email-hash on their email address. For example: $ cd contact # this is your local clone of the \"contact\" repo $ bin/email-hash user@example.net 25357f62c7ab2ae11ddda1efd272bb5435dbfacb Then your new entry will look like 25357f62c7ab2ae11ddda1efd272bb5435dbfacb : FullName : Example A. User .... The FullName and Profile fields in the main section, and the PrimaryEmail field in the ContactInformation section are required. The PrimaryEmail field in the ContactInformation section should match the hash that you used for the ID. In addition, if they will be making pull requests against the topology repo, e.g. for updating site information, reporting downtime, or updating project or VO information, obtain their GitHub username and put it in the GitHub field.","title":"Adding a new contact"},{"location":"services/topology-contacts-data/#editing-a-contact","text":"Once you have found a contact in the YAML file, edit the attributes by hand. If you want to add information that is not present for that contact, look at template-contacts.yaml to find out what the attributes are called. Note The ID of the contact never changes, even if the user's PrimaryEmail changes. Important If you change the contact's FullName , you must make the same change to every place that the contact is mentioned in the topology repo. Get the contact changes merged in first.","title":"Editing a contact"},{"location":"services/topology/","text":"Topology Service This document contains information about the service that runs: https://topology.opensciencegrid.org https://topology-itb.opensciencegrid.org https://map.opensciencegrid.org : Generates the topology map used on OSG Display The source code for the service is in https://github.com/opensciencegrid/topology , in the src/ subdirectory. This repository also contains the public part of the data that gets served. Deployment Topology is a webapp run with Apache on the host topology.opensciencegrid.org . The ITB instance runs on the host topology-itb.opensciencegrid.org . The hosts are VMs at Nebraska; for SSH access, contact Derek Weitzel or Brian Bockelman. Installation These instructions assume an EL 7 host with the EPEL repositories available. The software will be installed into /opt/topology . A second instance for the webhook app will be installed into /opt/topology-webhook . (The ITB instance should be installed into /opt/topology-itb and /opt/topology-itb-webhook instead.) The following steps should be done as root. Install prerequisites: # yum install python36 gridsite httpd mod_ssl Clone the repository: For the production topology host: # git clone https://github.com/opensciencegrid/topology /opt/topology # git clone https://github.com/opensciencegrid/topology /opt/topology-webhook For the topology-itb host: # git clone https://github.com/opensciencegrid/topology /opt/topology-itb # git clone https://github.com/opensciencegrid/topology /opt/topology-itb-webhook Set up the virtualenv in the clone -- from /opt/topology or /opt/topology-itb : # python36 -m venv venv # . ./venv/bin/activate # pip install -r requirements-apache.txt Repeat for the webhook instance -- from /opt/topology-webhook or /opt/topology-itb-webhook . 
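(Aside: the Apache configuration described below points at a mod_wsgi .so inside the virtualenv created here, so it can be worth confirming that pip actually installed it. A minimal sketch; the path assumes the production instance at /opt/topology, so adjust it for the -itb and webhook instances.)

    import glob

    # Assumed layout: production instance at /opt/topology with a python3.6 venv.
    pattern = "/opt/topology/venv/lib/python3.6/site-packages/mod_wsgi/server/mod_wsgi*.so"
    matches = glob.glob(pattern)
    if matches:
        print("mod_wsgi module:", matches[0])
    else:
        print("mod_wsgi .so not found; re-check 'pip install -r requirements-apache.txt'")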
File system locations The following files/directories must exist and have the proper permissions: Location Purpose Ownership Mode /opt/topology Production software install root:root 0755 /opt/topology-itb ITB software install root:root 0755 /opt/topology-webhook Production webhook software install root:root 0755 /opt/topology-itb-webhook ITB webhook software install root:root 0755 /etc/opt/topology/config-production.py Production config root:root 0644 /etc/opt/topology/config-itb.py ITB config root:root 0644 /etc/opt/topology/bitbucket Private key for contact info repo apache:root 0600 /etc/opt/topology/bitbucket.pub Public key for contact info repo apache:root 0644 /etc/opt/topology/github Private key for pushing automerge commits topomerge:root 0600 /etc/opt/topology/github.pub Public key for pushing automerge commits topomerge:root 0644 /etc/opt/topology/github_webhook_secret GitHub webhook secret for validating webhooks topomerge:root 0600 ~apache/.ssh SSH dir for Apache apache:root 0700 ~apache/.ssh/known_hosts Known hosts file for Apache apache:root 0644 ~topomerge Home dir for topomerge Apache user topomerge:root 0755 ~topomerge/.ssh SSH dir for topomerge Apache user topomerge:root 0700 ~topomerge/.ssh/known_hosts Known hosts file for topomerge Apache user topomerge:root 0644 /var/cache/topology Checkouts of topology and contacts data for production instance apache:apache 0755 /var/cache/topology-itb Checkouts of topology and contacts data for ITB instance apache:apache 0755 /var/cache/topology-webhook Topology repo and state info for production webhook instance topomerge:topomerge 0755 /var/cache/topology-itb-webhook Topology repo and state info for ITB webhook instance topomerge:topomerge 0755 ~apache/.ssh/known_hosts must contain an entry for bitbucket.org ; use ssh-keyscan bitbucket.org to get the appropriate entry. ~topomerge/.ssh/known_hosts must contain an entry for github.com ; use ssh-keyscan github.com to get the appropriate entry. Software configuration Configuration for the main app is under /etc/opt/topology/ , in config-production.py and config-itb.py . The webhook app configuration is in config-production-webhook.py and config-itb-webhook.py . The files are in Python format and override default settings in src/webapp/default_config.py in the topology repo. HTTPD configuration is in /etc/httpd ; we use the modules mod_ssl , mod_gridsite , and mod_wsgi . The first two are installed via yum; the .so file for mod_wsgi is located in /opt/topology/venv/lib/python3.6/site-packages/mod_wsgi/server/ or /opt/topology-itb/venv/lib/python3.6/site-packages/mod_wsgi/server/ for the ITB instance. Each of the hostnames are VHosts in the apache configuration. Some special notes: https://map.opensciencegrid.org runs in the same wsgi process as the production topology, but the URL is limited to only the map code. Further, it does not use mod_gridsite so that users are not asked to present a client certificate. VHosts are configured: ServerName topology.opensciencegrid.org ServerAlias my.opensciencegrid.org myosg.opensciencegrid.org Data configuration Configuration is in /etc/opt/topology/config-production.py and config-itb.py ; and config-production-webhook.py and config-itb-webhook.py . 
Variable Purpose TOPOLOGY_DATA_DIR The directory containing a clone of the topology repository for data use TOPOLOGY_DATA_REPO The remote tracking repository of TOPOLOGY_DATA_DIR TOPOLOGY_DATA_BRANCH The remote tracking branch of TOPOLOGY_DATA_DIR WEBHOOK_DATA_DIR The directory containing a mirror-clone of the topology repository for webhook use WEBHOOK_DATA_REPO The remote tracking repository of WEBHOOK_DATA_DIR WEBHOOK_DATA_BRANCH The remote tracking branch of WEBHOOK_DATA_DIR WEBHOOK_STATE_DIR Directory containing webhook state information between pull request and status hooks WEBHOOK_SECRET_KEY Secret key configured on GitHub for webhook delivery CONTACT_DATA_DIR The directory containing a clone of the contact repository for data use CONTACT_DATA_REPO The remote tracking repository of CONTACT_DATA_DIR (default: \"git@bitbucket.org:opensciencegrid/contact.git\" ) CONTACT_DATA_BRANCH The remote tracking branch of CONTACT_DATA_BRANCH (default: \"master\" ) CACHE_LIFETIME Frequency of automatic data updates in seconds (default: 900 ) GIT_SSH_KEY Location of ssh public key file for git access. /etc/opt/topology/bitbucket.pub for the main app, and /etc/opt/topology/github.pub for the webhook app Puppet ensures that the production contact and topology clones are up to date with their configured remote tracking repo and branch. Puppet does not manage the ITB data directories so they need to be updated by hand during testing. GitHub Configuration for Webhook App Go to the https://github.com/opensciencegrid/topology/settings/hooks page on GitHub. There are four webhooks to set up; pull_request and status for both the topology and topology-itb hosts. Payload URL Content type Events to trigger webhook https://topology.opensciencegrid.org/webhook/status application/json Statuses https://topology.opensciencegrid.org/webhook/pull_request application/json Pull requests https://topology-itb.opensciencegrid.org/webhook/status application/json Statuses https://topology-itb.opensciencegrid.org/webhook/pull_request application/json Pull requests For each webhook, \"Secret\" should be a random 40 digit hex string, which should match the contents of the file /etc/opt/topology/github_webhook_secret (the path configured in WEBHOOK_SECRET_KEY ). The OSG's dedicated GitHub user for automating pushes is currently osg-bot . This user needs to have write access to the topology repo on GitHub. The ssh public key in /etc/opt/topology/github.pub should be registered with the osg-bot GitHub user. This can be done by logging into GitHub as osg-bot , and adding the new ssh key under the settings page. Required System Packages Currently the webhook app uses the mailx command to send email. If not already installed, install it with: :::console # yum install mailx Testing changes on the ITB instance All changes should be tested on the ITB instance before deploying to production. If you can, test them on your local machine first. These instructions assume that the code has not been merged to master. Update the ITB software installation at /opt/topology-itb and note the current branch: # cd /opt/topology-itb # git fetch --all # git status Check out the branch you are testing. 
If the target remote is not configured, add it : # git checkout -b / Verify that you are using the intended data associated with the code you are testing: If the data format has changed in an incompatible way, modify /etc/opt/topology/config-itb.py : Backup the ITB configuration file: # cd /etc/opt/topology # cp -p config-itb.py { ,.bak } Change the TOPOLOGY_DATA_DIR and/or CONTACT_DATA_DIR lines to point to a new directories so the previous data does not get overwritten with incompatible data. If you need to use a different branch for the data, switch to it: Check the branch of TOPOLOGY_DATA_DIR from /etc/opt/topology/config-itb.py # cd # git fetch --all # git status Note the previous branch, you will need this later If the target remote is not configured, add it Check out the target branch: # git checkout -b / Pull any upstream changes to ensure that your branch is up to date: # git pull For updates to the webhook app, follow the above instructions for the ITB webhook instance under /opt/topology-itb-webhook and its corresponding config file, /etc/opt/topology/config-itb-webhook.py . Restart httpd : # systemctl restart httpd Test the web interface at https://topology-itb.opensciencegrid.org . Errors and output are in /var/log/httpd/error_log . Reverting changes Switch /opt/topology-itb to the previous branch: # cd /opt/topology-itb # git checkout For updates to the webhook app, switch /opt/topology-itb-webhook to the previous master: # cd /opt/topology-itb-webhook # git checkout If you made config changes to /etc/opt/topology/config-itb.py or config-itb-webhook.py , restore the backup. If you checked out a different branch for data, revert it back to the old branch. Restart httpd : # systemctl restart httpd Test the web interface at https://topology-itb.opensciencegrid.org . Updating the production instance Updating the production instance is similar to updating ITB instance. Update master on the Git clone at /opt/topology : # cd /opt/topology # git pull origin master For updates to the webhook app, update master on the Git clone at /opt/topology-webhook : # cd /opt/topology-webhook # git pull origin master Make config changes to /etc/opt/topology/config-production.py and/or config-production-webhook.py if necessary. Restart httpd : # systemctl restart httpd Test the web interface at https://topology.opensciencegrid.org . Errors and output are in /var/log/httpd/error_log . Reverting changes Switch /opt/topology to the previous master: # cd /opt/topology # ## (use `git reflog` to find the previous commit that was used) # git reset --hard For updates to the webhook app, switch /opt/topology-webhook to the previous master: # cd /opt/topology-webhook ### (use `git reflog` to find the previous commit that was used) # git reset --hard If you made config changes to /etc/opt/topology/config-production.py or config-production-webhook.py , revert them. Restart httpd : # systemctl restart httpd Test the web interface at https://topology.opensciencegrid.org .","title":"Topology Service"},{"location":"services/topology/#topology-service","text":"This document contains information about the service that runs: https://topology.opensciencegrid.org https://topology-itb.opensciencegrid.org https://map.opensciencegrid.org : Generates the topology map used on OSG Display The source code for the service is in https://github.com/opensciencegrid/topology , in the src/ subdirectory. 
This repository also contains the public part of the data that gets served.","title":"Topology Service"},{"location":"services/topology/#deployment","text":"Topology is a webapp run with Apache on the host topology.opensciencegrid.org . The ITB instance runs on the host topology-itb.opensciencegrid.org . The hosts are VMs at Nebraska; for SSH access, contact Derek Weitzel or Brian Bockelman.","title":"Deployment"},{"location":"services/topology/#installation","text":"These instructions assume an EL 7 host with the EPEL repositories available. The software will be installed into /opt/topology . A second instance for the webhook app will be installed into /opt/topology-webhook . (The ITB instance should be installed into /opt/topology-itb and /opt/topology-itb-webhook instead.) The following steps should be done as root. Install prerequisites: # yum install python36 gridsite httpd mod_ssl Clone the repository: For the production topology host: # git clone https://github.com/opensciencegrid/topology /opt/topology # git clone https://github.com/opensciencegrid/topology /opt/topology-webhook For the topology-itb host: # git clone https://github.com/opensciencegrid/topology /opt/topology-itb # git clone https://github.com/opensciencegrid/topology /opt/topology-itb-webhook Set up the virtualenv in the clone -- from /opt/topology or /opt/topology-itb : # python36 -m venv venv # . ./venv/bin/activate # pip install -r requirements-apache.txt Repeat for the webhook instance -- from /opt/topology-webhook or /opt/topology-itb-webhook .","title":"Installation"},{"location":"services/topology/#file-system-locations","text":"The following files/directories must exist and have the proper permissions: Location Purpose Ownership Mode /opt/topology Production software install root:root 0755 /opt/topology-itb ITB software install root:root 0755 /opt/topology-webhook Production webhook software install root:root 0755 /opt/topology-itb-webhook ITB webhook software install root:root 0755 /etc/opt/topology/config-production.py Production config root:root 0644 /etc/opt/topology/config-itb.py ITB config root:root 0644 /etc/opt/topology/bitbucket Private key for contact info repo apache:root 0600 /etc/opt/topology/bitbucket.pub Public key for contact info repo apache:root 0644 /etc/opt/topology/github Private key for pushing automerge commits topomerge:root 0600 /etc/opt/topology/github.pub Public key for pushing automerge commits topomerge:root 0644 /etc/opt/topology/github_webhook_secret GitHub webhook secret for validating webhooks topomerge:root 0600 ~apache/.ssh SSH dir for Apache apache:root 0700 ~apache/.ssh/known_hosts Known hosts file for Apache apache:root 0644 ~topomerge Home dir for topomerge Apache user topomerge:root 0755 ~topomerge/.ssh SSH dir for topomerge Apache user topomerge:root 0700 ~topomerge/.ssh/known_hosts Known hosts file for topomerge Apache user topomerge:root 0644 /var/cache/topology Checkouts of topology and contacts data for production instance apache:apache 0755 /var/cache/topology-itb Checkouts of topology and contacts data for ITB instance apache:apache 0755 /var/cache/topology-webhook Topology repo and state info for production webhook instance topomerge:topomerge 0755 /var/cache/topology-itb-webhook Topology repo and state info for ITB webhook instance topomerge:topomerge 0755 ~apache/.ssh/known_hosts must contain an entry for bitbucket.org ; use ssh-keyscan bitbucket.org to get the appropriate entry. 
~topomerge/.ssh/known_hosts must contain an entry for github.com ; use ssh-keyscan github.com to get the appropriate entry.","title":"File system locations"},{"location":"services/topology/#software-configuration","text":"Configuration for the main app is under /etc/opt/topology/ , in config-production.py and config-itb.py . The webhook app configuration is in config-production-webhook.py and config-itb-webhook.py . The files are in Python format and override default settings in src/webapp/default_config.py in the topology repo. HTTPD configuration is in /etc/httpd ; we use the modules mod_ssl , mod_gridsite , and mod_wsgi . The first two are installed via yum; the .so file for mod_wsgi is located in /opt/topology/venv/lib/python3.6/site-packages/mod_wsgi/server/ or /opt/topology-itb/venv/lib/python3.6/site-packages/mod_wsgi/server/ for the ITB instance. Each of the hostnames are VHosts in the apache configuration. Some special notes: https://map.opensciencegrid.org runs in the same wsgi process as the production topology, but the URL is limited to only the map code. Further, it does not use mod_gridsite so that users are not asked to present a client certificate. VHosts are configured: ServerName topology.opensciencegrid.org ServerAlias my.opensciencegrid.org myosg.opensciencegrid.org","title":"Software configuration"},{"location":"services/topology/#data-configuration","text":"Configuration is in /etc/opt/topology/config-production.py and config-itb.py ; and config-production-webhook.py and config-itb-webhook.py . Variable Purpose TOPOLOGY_DATA_DIR The directory containing a clone of the topology repository for data use TOPOLOGY_DATA_REPO The remote tracking repository of TOPOLOGY_DATA_DIR TOPOLOGY_DATA_BRANCH The remote tracking branch of TOPOLOGY_DATA_DIR WEBHOOK_DATA_DIR The directory containing a mirror-clone of the topology repository for webhook use WEBHOOK_DATA_REPO The remote tracking repository of WEBHOOK_DATA_DIR WEBHOOK_DATA_BRANCH The remote tracking branch of WEBHOOK_DATA_DIR WEBHOOK_STATE_DIR Directory containing webhook state information between pull request and status hooks WEBHOOK_SECRET_KEY Secret key configured on GitHub for webhook delivery CONTACT_DATA_DIR The directory containing a clone of the contact repository for data use CONTACT_DATA_REPO The remote tracking repository of CONTACT_DATA_DIR (default: \"git@bitbucket.org:opensciencegrid/contact.git\" ) CONTACT_DATA_BRANCH The remote tracking branch of CONTACT_DATA_BRANCH (default: \"master\" ) CACHE_LIFETIME Frequency of automatic data updates in seconds (default: 900 ) GIT_SSH_KEY Location of ssh public key file for git access. /etc/opt/topology/bitbucket.pub for the main app, and /etc/opt/topology/github.pub for the webhook app Puppet ensures that the production contact and topology clones are up to date with their configured remote tracking repo and branch. Puppet does not manage the ITB data directories so they need to be updated by hand during testing.","title":"Data configuration"},{"location":"services/topology/#github-configuration-for-webhook-app","text":"Go to the https://github.com/opensciencegrid/topology/settings/hooks page on GitHub. There are four webhooks to set up; pull_request and status for both the topology and topology-itb hosts. 
Payload URL Content type Events to trigger webhook https://topology.opensciencegrid.org/webhook/status application/json Statuses https://topology.opensciencegrid.org/webhook/pull_request application/json Pull requests https://topology-itb.opensciencegrid.org/webhook/status application/json Statuses https://topology-itb.opensciencegrid.org/webhook/pull_request application/json Pull requests For each webhook, \"Secret\" should be a random 40 digit hex string, which should match the contents of the file /etc/opt/topology/github_webhook_secret (the path configured in WEBHOOK_SECRET_KEY ). The OSG's dedicated GitHub user for automating pushes is currently osg-bot . This user needs to have write access to the topology repo on GitHub. The ssh public key in /etc/opt/topology/github.pub should be registered with the osg-bot GitHub user. This can be done by logging into GitHub as osg-bot , and adding the new ssh key under the settings page.","title":"GitHub Configuration for Webhook App"},{"location":"services/topology/#required-system-packages","text":"Currently the webhook app uses the mailx command to send email. If not already installed, install it with: :::console # yum install mailx","title":"Required System Packages"},{"location":"services/topology/#testing-changes-on-the-itb-instance","text":"All changes should be tested on the ITB instance before deploying to production. If you can, test them on your local machine first. These instructions assume that the code has not been merged to master. Update the ITB software installation at /opt/topology-itb and note the current branch: # cd /opt/topology-itb # git fetch --all # git status Check out the branch you are testing. If the target remote is not configured, add it : # git checkout -b / Verify that you are using the intended data associated with the code you are testing: If the data format has changed in an incompatible way, modify /etc/opt/topology/config-itb.py : Backup the ITB configuration file: # cd /etc/opt/topology # cp -p config-itb.py { ,.bak } Change the TOPOLOGY_DATA_DIR and/or CONTACT_DATA_DIR lines to point to a new directories so the previous data does not get overwritten with incompatible data. If you need to use a different branch for the data, switch to it: Check the branch of TOPOLOGY_DATA_DIR from /etc/opt/topology/config-itb.py # cd # git fetch --all # git status Note the previous branch, you will need this later If the target remote is not configured, add it Check out the target branch: # git checkout -b / Pull any upstream changes to ensure that your branch is up to date: # git pull For updates to the webhook app, follow the above instructions for the ITB webhook instance under /opt/topology-itb-webhook and its corresponding config file, /etc/opt/topology/config-itb-webhook.py . Restart httpd : # systemctl restart httpd Test the web interface at https://topology-itb.opensciencegrid.org . Errors and output are in /var/log/httpd/error_log .","title":"Testing changes on the ITB instance"},{"location":"services/topology/#reverting-changes","text":"Switch /opt/topology-itb to the previous branch: # cd /opt/topology-itb # git checkout For updates to the webhook app, switch /opt/topology-itb-webhook to the previous master: # cd /opt/topology-itb-webhook # git checkout If you made config changes to /etc/opt/topology/config-itb.py or config-itb-webhook.py , restore the backup. If you checked out a different branch for data, revert it back to the old branch. 
Restart httpd : # systemctl restart httpd Test the web interface at https://topology-itb.opensciencegrid.org .","title":"Reverting changes"},{"location":"services/topology/#updating-the-production-instance","text":"Updating the production instance is similar to updating ITB instance. Update master on the Git clone at /opt/topology : # cd /opt/topology # git pull origin master For updates to the webhook app, update master on the Git clone at /opt/topology-webhook : # cd /opt/topology-webhook # git pull origin master Make config changes to /etc/opt/topology/config-production.py and/or config-production-webhook.py if necessary. Restart httpd : # systemctl restart httpd Test the web interface at https://topology.opensciencegrid.org . Errors and output are in /var/log/httpd/error_log .","title":"Updating the production instance"},{"location":"services/topology/#reverting-changes_1","text":"Switch /opt/topology to the previous master: # cd /opt/topology # ## (use `git reflog` to find the previous commit that was used) # git reset --hard For updates to the webhook app, switch /opt/topology-webhook to the previous master: # cd /opt/topology-webhook ### (use `git reflog` to find the previous commit that was used) # git reset --hard If you made config changes to /etc/opt/topology/config-production.py or config-production-webhook.py , revert them. Restart httpd : # systemctl restart httpd Test the web interface at https://topology.opensciencegrid.org .","title":"Reverting changes"},{"location":"troubleshooting/repository-scripts/","text":"Troubleshooting Guide for Yum Repository Scripts The repo.opensciencegrid.org and repo-itb.opensciencegrid.org hosts contain the OSG Yum software repositories plus related services and tools. In particular, the mash software is used to download RPMs from where they are built (at the University of Wisconsin\u2013Madison), and there are some associated scripts to configure and invoke mash periodically. Use this guide to monitor the mash system for problems and to perform basic troubleshooting when such problems arise. Monitoring To monitor the repository hosts for proper mash operation, do the following steps on each host: ssh to repo.opensciencegrid.org and cd into /var/log/repo to view logs from mash updates Examine the \u201cLast modified\u201d timestamp of all of the update_repo.*.log files If the timestamps are all less than 2 hours old, life is good and you can skip the remaining steps below Otherwise, examine the \u201cLast modified\u201d timestamp of the update_all_repos.err file If the update_all_repos.err timestamp is current, there may be a mash process that is hung; see the Troubleshooting steps below If all timestamps are more than 6 hours old, something may be wrong with cron or its mash entries: Verify that cron is running and that the cron entries for mash are still present; if not, try to restore things Otherwise, create a Freshdesk ticket with a subject like \u201cRepo update logs are too old on \u201d and with relevant details in the body Assign the ticket to the \u201cSoftware\u201d group Troubleshooting and Mitigation Identifying and fixing a hung mash process If a mash update process hangs, all future invocations from cron of the mash scripts will exit without taking action because of the hung process. Thus, it is important to identify and remove any hung processes so that future updates can proceed. 
Use the procedure below to remove any hung mash processes; doing so is safe in that it will not adversely affect the Yum repositories being served from the host. In the listing of log files (see above), view the file =update_all_repos.err= In the error log file, look for messages such as: Wed Jan 20 18:10:02 UTC 2016: **Can't acquire lock, is update_all_repos.sh already running?** This message indicates that the most recent update attempt quit early due to the presence of a lock file, most likely from a hung mash process. Look for mash processes: root@host # ps -C mash -o pid,ppid,pgid,start,command PID PPID PGID STARTED COMMAND 24551 24549 23455 Jan 15 /usr/bin/python /usr/bin/mash osg-3.1-el5-release -o 24552 24551 23455 Jan 15 /usr/bin/python /usr/bin/mash osg-3.1-el5-release -o If there are mash processes that started on a previous date or more than 2 hours ago, it is best to remove their corresponding process groups (PGID above): root@host # kill -TERM -23455 Then verify that the old processes are gone using the same ps command as above: root@host # ps -C mash -o pid,ppid,pgid,start,command PID PPID PGID STARTED COMMAND If any part of this process does not look or work as expected: Create a Freshdesk ticket with a subject like \u201cRepo update logs are too old on \u201d and with relevant details in the body Assign the ticket to the \u201cSoftware\u201d group","title":"Troubleshooting Guide for Yum Repository Scripts"},{"location":"troubleshooting/repository-scripts/#troubleshooting-guide-for-yum-repository-scripts","text":"The repo.opensciencegrid.org and repo-itb.opensciencegrid.org hosts contain the OSG Yum software repositories plus related services and tools. In particular, the mash software is used to download RPMs from where they are built (at the University of Wisconsin\u2013Madison), and there are some associated scripts to configure and invoke mash periodically. 
Use this guide to monitor the mash system for problems and to perform basic troubleshooting when such problems arise.","title":"Troubleshooting Guide for Yum Repository Scripts"},{"location":"troubleshooting/repository-scripts/#monitoring","text":"To monitor the repository hosts for proper mash operation, do the following steps on each host: ssh to repo.opensciencegrid.org and cd into /var/log/repo to view logs from mash updates Examine the \u201cLast modified\u201d timestamp of all of the update_repo.*.log files If the timestamps are all less than 2 hours old, life is good and you can skip the remaining steps below Otherwise, examine the \u201cLast modified\u201d timestamp of the update_all_repos.err file If the update_all_repos.err timestamp is current, there may be a mash process that is hung; see the Troubleshooting steps below If all timestamps are more than 6 hours old, something may be wrong with cron or its mash entries: Verify that cron is running and that the cron entries for mash are still present; if not, try to restore things Otherwise, create a Freshdesk ticket with a subject like \u201cRepo update logs are too old on \u201d and with relevant details in the body Assign the ticket to the \u201cSoftware\u201d group","title":"Monitoring"},{"location":"troubleshooting/repository-scripts/#troubleshooting-and-mitigation","text":"","title":"Troubleshooting and Mitigation"},{"location":"troubleshooting/repository-scripts/#identifying-and-fixing-a-hung-mash-process","text":"If a mash update process hangs, all future invocations from cron of the mash scripts will exit without taking action because of the hung process. Thus, it is important to identify and remove any hung processes so that future updates can proceed. Use the procedure below to remove any hung mash processes; doing so is safe in that it will not adversely affect the Yum repositories being served from the host. In the listing of log files (see above), view the file =update_all_repos.err= In the error log file, look for messages such as: Wed Jan 20 18:10:02 UTC 2016: **Can't acquire lock, is update_all_repos.sh already running?** This message indicates that the most recent update attempt quit early due to the presence of a lock file, most likely from a hung mash process. Look for mash processes: root@host # ps -C mash -o pid,ppid,pgid,start,command PID PPID PGID STARTED COMMAND 24551 24549 23455 Jan 15 /usr/bin/python /usr/bin/mash osg-3.1-el5-release -o 24552 24551 23455 Jan 15 /usr/bin/python /usr/bin/mash osg-3.1-el5-release -o If there are mash processes that started on a previous date or more than 2 hours ago, it is best to remove their corresponding process groups (PGID above): root@host # kill -TERM -23455 Then verify that the old processes are gone using the same ps command as above: root@host # ps -C mash -o pid,ppid,pgid,start,command PID PPID PGID STARTED COMMAND If any part of this process does not look or work as expected: Create a Freshdesk ticket with a subject like \u201cRepo update logs are too old on \u201d and with relevant details in the body Assign the ticket to the \u201cSoftware\u201d group","title":"Identifying and fixing a hung mash process"}]}
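(Aside: the monitoring thresholds above lend themselves to a small script. This is a rough sketch rather than an official tool; it only reports, it does not open tickets or kill processes, and the 2-hour and 6-hour thresholds are taken from the steps above.)

    import glob
    import os
    import time

    LOG_DIR = "/var/log/repo"
    now = time.time()

    def age_hours(path):
        return (now - os.path.getmtime(path)) / 3600.0

    logs = glob.glob(os.path.join(LOG_DIR, "update_repo.*.log"))
    if logs and all(age_hours(p) <= 2 for p in logs):
        print("OK: all update_repo.*.log files were modified within the last 2 hours")
    else:
        err = os.path.join(LOG_DIR, "update_all_repos.err")
        if os.path.exists(err) and age_hours(err) <= 2:
            print("WARNING: update_all_repos.err is current; a mash process may be hung")
        elif not logs or all(age_hours(p) > 6 for p in logs):
            print("WARNING: logs are more than 6 hours old; check cron and its mash entries")
        else:
            print("WARNING: some update logs are stale; investigate " + LOG_DIR)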
\ No newline at end of file
+{"config":{"lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"OSG Operations Welcome to the home page of the OSG Operations Team documentation area! Mission The mission of OSG Operations is to maintain and improve distributed high throughput computing services to support research communities. This is accomplished by: Operating and maintaining our services in a user-oriented, robust, and reliable manner. Developing a professional and skilled staff dedicated to a service philosophy. Managing resources responsibly, efficiently, and with accountability. Evaluating and continually improving the actions, methods and processes that allow the OSG to operate. Contact Us Open a Ticket Slack channel - if you can't create an account, send an e-mail to help@opensciencegrid.org Email: help@opensciencegrid.org Registration (Contact, Resource, VO, or Project) Register with OSG Weekly Operations Meetings When: Fridays 12:30 pm Central URL: https://unl.zoom.us/j/183382852 Phone: +1 669 900 6833 or +1 408 638 0968 or +1 646 876 9923 Meeting ID: 183 382 852 (password required; available on request) Meeting Minutes September 13, 2024 September 6, 2024 August 23, 2024 August 16, 2024 August 9, 2024 August 2, 2024 July 26, 2024 July 19, 2024 July 5, 2024 June 28, 2024 June 21, 2024 June 14, 2024 June 7, 2024 May 31, 2024 May 24, 2024 May 17, 2024 May 10, 2024 May 3, 2024 April 26, 2024 April 19, 2024 April 12, 2024 April 5, 2024 March 29, 2024 (canceled) March 22, 2024 March 15, 2024 March 8, 2024 March 1, 2024 February 23, 2024 February 16, 2024 February 9, 2024 February 2, 2024 January 26, 2024 January 19, 2024 January 12, 2024 January 5, 2024 December 29, 2023 (canceled) December 22, 2023 (canceled) December 15, 2023 December 8, 2023 December 1, 2023 November 24, 2023 (canceled) November 17, 2023 November 10, 2023 November 3, 2023 October 27, 2023 October 20, 2023 October 13, 2023 October 6, 2023 September 29, 2023 September 22, 2023 September 15, 2023 September 8, 2023 September 1, 2023 August 25, 2023 August 18, 2023 August 11, 2023 August 4, 2023 July 28, 2023 July 21, 2023 January 14, 2023 (canceled due to Throughput Computing 23) July 7, 2023 June 30, 2023 June 23, 2023 June 16, 2023 June 9, 2023 June 2, 2023 May 26, 2023 May 19, 2023 May 12, 2023 May 5, 2023 April 28, 2023 April 21, 2023 April 14, 2023 April 7, 2023 March 31, 2023 March 24, 2023 March 17, 2023 March 10, 2023 March 3, 2023 February 24, 2023 February 17, 2023 February 10, 2023 February 3, 2023 January 27, 2023 January 20, 2023 January 13, 2023 January 6, 2023 (canceled) December 30, 2022 (canceled) December 23, 2022 (canceled) December 16, 2022 December 9, 2022 December 2, 2022 November 25, 2022 (canceled) November 18, 2022 November 11, 2022 November 4, 2022 October 28, 2022 October 21, 2022 October 14, 2022 October 7, 2022 September 30, 2022 September 23, 2022 (canceled) September 16, 2022 (canceled) September 9, 2022 September 2, 2022 August 26, 2022 August 19, 2022 August 12, 2022 August 5, 2022 (canceled) July 29, 2022 July 22, 2022 (canceled) July 15, 2022 July 8, 2022 July 1, 2022 June 24, 2022 June 17, 2022 June 10, 2022 June 3, 2022 May 27, 2022 May 20, 2022 May 13, 2022 May 6, 2022 (canceled) April 29, 2022 April 22, 2022 April 15, 2022 April 8, 2022 April 1, 2022 March 25, 2022 March 18, 2022 (canceled) March 11, 2022 March 4, 2022 February 25, 2022 February 18, 2022 February 11, 2022 February 4, 2022 January 28, 2022 January 21, 2022 January 14, 2022 January 7, 
2022 December 31, 2021 (canceled) December 24, 2021 (canceled) December 17, 2021 December 10, 2021 December 3, 2021 November 26, 2021 (canceled) November 19, 2021 November 12, 2021 November 5, 2021 October 29, 2021 October 22, 2021 October 15, 2021 (canceled) October 8, 2021 October 1, 2021 September 24, 2021 September 17, 2021 September 10, 2021 September 3, 2021 August 27, 2021 August 20, 2021 August 13, 2021 August 6, 2021 July 30, 2021 July 23, 2021 July 16, 2021 July 9, 2021 July 2, 2021 June 25, 2021 June 18, 2021 June 11, 2021 June 4, 2021 May 28, 2021 May 21, 2021 May 14, 2021 May 7, 2021 April 30, 2021 April 23, 2021 April 16, 2021 April 9, 2021 April 2, 2021 March 26, 2021 March 19, 2021 March 12, 2021 March 5, 2021 (canceled) February 26, 2021 February 19, 2021 February 12, 2021 February 5, 2021 January 29, 2021 January 22, 2021 January 15, 2021 January 8, 2021 January 1, 2021 (canceled) December 25, 2020 (canceled) December 18, 2020 December 11, 2020 December 4, 2020 November 20, 2020 November 13, 2020 November 6, 2020 October 30, 2020 October 23, 2020 October 16, 2020 October 9, 2020 October 2, 2020 September 25, 2020 September 18, 2020 September 11, 2020 September 4, 2020 (canceled) August 28, 2020 August 21, 2020 August 14, 2020 August 7, 2020 July 31, 2020 July 24, 2020 July 17, 2020 July 10, 2020 July 3, 2020 (canceled) June 26, 2020 June 19, 2020 June 12, 2020 June 5, 2020 May 29, 2020 (canceled) May 22, 2020 May 15, 2020 May 8, 2020 May 1, 2020 April 24, 2020 April 17, 2020 April 10, 2020 April 3, 2020 March 27, 2020 March 20, 2020 March 13, 2020 March 6, 2020 February 28, 2020 February 21, 2020 February 14, 2020 February 7, 2020 January 31, 2020 January 24, 2020 January 17, 2020 January 10, 2020 January 3, 2020 December 27, 2019 December 20, 2019 December 13, 2019 December 6, 2019 November 29, 2019 (canceled) November 22, 2019 November 15, 2019 November 8, 2019 November 1, 2019 October 25, 2019 October 18, 2019 October 11, 2019 October 4, 2019 September 27, 2019 September 20, 2019 September 13, 2019 September 6, 2019 August 30, 2019 August 23, 2019 August 16, 2019 August 9, 2019 August 2, 2019 July 26, 2019 July 19, 2019 July 12, 2019 July 8, 2019 July 1, 2019 June 24, 2019 June 17, 2019 June 10, 2019 June 3, 2019 May 28, 2019 May 20, 2019 May 13, 2019 May 6, 2019 April 29, 2019 April 22, 2019 April 15, 2019 April 8, 2019 April 1, 2019 March 25, 2019 March 18, 2019 (canceled due to HOW 2019) March 11, 2019 March 4, 2019 February 25, 2019 February 19, 2019 February 11, 2019 February 4, 2019 January 28, 2019 (canceled due to F2F meeting) January 22, 2019 January 14, 2019 January 7, 2019 December 31, 2018 (canceled) December 24, 2018 (canceled) December 17, 2018 December 10, 2018 December 3, 2018 November 26, 2018 November 19, 2018 November 13, 2018 November 5, 2018 (canceled) October 29, 2018 (canceled) October 22, 2018 (canceled) October 15, 2018 October 8, 2018 October 1, 2018 September 24, 2018 September 17, 2018 September 10, 2018 September 4, 2018 August 27, 2018 August 20, 2018 August 13, 2018 August 6, 2018 Archived Meeting Minutes For archived meeting minutes, see the GitHub repository","title":"Home"},{"location":"#osg-operations","text":"Welcome to the home page of the OSG Operations Team documentation area!","title":"OSG Operations"},{"location":"#mission","text":"The mission of OSG Operations is to maintain and improve distributed high throughput computing services to support research communities. 
This is accomplished by: Operating and maintaining our services in a user-oriented, robust, and reliable manner. Developing a professional and skilled staff dedicated to a service philosophy. Managing resources responsibly, efficiently, and with accountability. Evaluating and continually improving the actions, methods and processes that allow the OSG to operate.","title":"Mission"},{"location":"#contact-us","text":"Open a Ticket Slack channel - if you can't create an account, send an e-mail to help@opensciencegrid.org Email: help@opensciencegrid.org","title":"Contact Us"},{"location":"#registration-contact-resource-vo-or-project","text":"Register with OSG","title":"Registration (Contact, Resource, VO, or Project)"},{"location":"#weekly-operations-meetings","text":"When: Fridays 12:30 pm Central URL: https://unl.zoom.us/j/183382852 Phone: +1 669 900 6833 or +1 408 638 0968 or +1 646 876 9923 Meeting ID: 183 382 852 (password required; available on request)","title":"Weekly Operations Meetings"},{"location":"#meeting-minutes","text":"September 13, 2024 September 6, 2024 August 23, 2024 August 16, 2024 August 9, 2024 August 2, 2024 July 26, 2024 July 19, 2024 July 5, 2024 June 28, 2024 June 21, 2024 June 14, 2024 June 7, 2024 May 31, 2024 May 24, 2024 May 17, 2024 May 10, 2024 May 3, 2024 April 26, 2024 April 19, 2024 April 12, 2024 April 5, 2024 March 29, 2024 (canceled) March 22, 2024 March 15, 2024 March 8, 2024 March 1, 2024 February 23, 2024 February 16, 2024 February 9, 2024 February 2, 2024 January 26, 2024 January 19, 2024 January 12, 2024 January 5, 2024 December 29, 2023 (canceled) December 22, 2023 (canceled) December 15, 2023 December 8, 2023 December 1, 2023 November 24, 2023 (canceled) November 17, 2023 November 10, 2023 November 3, 2023 October 27, 2023 October 20, 2023 October 13, 2023 October 6, 2023 September 29, 2023 September 22, 2023 September 15, 2023 September 8, 2023 September 1, 2023 August 25, 2023 August 18, 2023 August 11, 2023 August 4, 2023 July 28, 2023 July 21, 2023 January 14, 2023 (canceled due to Throughput Computing 23) July 7, 2023 June 30, 2023 June 23, 2023 June 16, 2023 June 9, 2023 June 2, 2023 May 26, 2023 May 19, 2023 May 12, 2023 May 5, 2023 April 28, 2023 April 21, 2023 April 14, 2023 April 7, 2023 March 31, 2023 March 24, 2023 March 17, 2023 March 10, 2023 March 3, 2023 February 24, 2023 February 17, 2023 February 10, 2023 February 3, 2023 January 27, 2023 January 20, 2023 January 13, 2023 January 6, 2023 (canceled) December 30, 2022 (canceled) December 23, 2022 (canceled) December 16, 2022 December 9, 2022 December 2, 2022 November 25, 2022 (canceled) November 18, 2022 November 11, 2022 November 4, 2022 October 28, 2022 October 21, 2022 October 14, 2022 October 7, 2022 September 30, 2022 September 23, 2022 (canceled) September 16, 2022 (canceled) September 9, 2022 September 2, 2022 August 26, 2022 August 19, 2022 August 12, 2022 August 5, 2022 (canceled) July 29, 2022 July 22, 2022 (canceled) July 15, 2022 July 8, 2022 July 1, 2022 June 24, 2022 June 17, 2022 June 10, 2022 June 3, 2022 May 27, 2022 May 20, 2022 May 13, 2022 May 6, 2022 (canceled) April 29, 2022 April 22, 2022 April 15, 2022 April 8, 2022 April 1, 2022 March 25, 2022 March 18, 2022 (canceled) March 11, 2022 March 4, 2022 February 25, 2022 February 18, 2022 February 11, 2022 February 4, 2022 January 28, 2022 January 21, 2022 January 14, 2022 January 7, 2022 December 31, 2021 (canceled) December 24, 2021 (canceled) December 17, 2021 December 10, 2021 December 3, 2021 November 26, 
2021 (canceled) November 19, 2021 November 12, 2021 November 5, 2021 October 29, 2021 October 22, 2021 October 15, 2021 (canceled) October 8, 2021 October 1, 2021 September 24, 2021 September 17, 2021 September 10, 2021 September 3, 2021 August 27, 2021 August 20, 2021 August 13, 2021 August 6, 2021 July 30, 2021 July 23, 2021 July 16, 2021 July 9, 2021 July 2, 2021 June 25, 2021 June 18, 2021 June 11, 2021 June 4, 2021 May 28, 2021 May 21, 2021 May 14, 2021 May 7, 2021 April 30, 2021 April 23, 2021 April 16, 2021 April 9, 2021 April 2, 2021 March 26, 2021 March 19, 2021 March 12, 2021 March 5, 2021 (canceled) February 26, 2021 February 19, 2021 February 12, 2021 February 5, 2021 January 29, 2021 January 22, 2021 January 15, 2021 January 8, 2021 January 1, 2021 (canceled) December 25, 2020 (canceled) December 18, 2020 December 11, 2020 December 4, 2020 November 20, 2020 November 13, 2020 November 6, 2020 October 30, 2020 October 23, 2020 October 16, 2020 October 9, 2020 October 2, 2020 September 25, 2020 September 18, 2020 September 11, 2020 September 4, 2020 (canceled) August 28, 2020 August 21, 2020 August 14, 2020 August 7, 2020 July 31, 2020 July 24, 2020 July 17, 2020 July 10, 2020 July 3, 2020 (canceled) June 26, 2020 June 19, 2020 June 12, 2020 June 5, 2020 May 29, 2020 (canceled) May 22, 2020 May 15, 2020 May 8, 2020 May 1, 2020 April 24, 2020 April 17, 2020 April 10, 2020 April 3, 2020 March 27, 2020 March 20, 2020 March 13, 2020 March 6, 2020 February 28, 2020 February 21, 2020 February 14, 2020 February 7, 2020 January 31, 2020 January 24, 2020 January 17, 2020 January 10, 2020 January 3, 2020 December 27, 2019 December 20, 2019 December 13, 2019 December 6, 2019 November 29, 2019 (canceled) November 22, 2019 November 15, 2019 November 8, 2019 November 1, 2019 October 25, 2019 October 18, 2019 October 11, 2019 October 4, 2019 September 27, 2019 September 20, 2019 September 13, 2019 September 6, 2019 August 30, 2019 August 23, 2019 August 16, 2019 August 9, 2019 August 2, 2019 July 26, 2019 July 19, 2019 July 12, 2019 July 8, 2019 July 1, 2019 June 24, 2019 June 17, 2019 June 10, 2019 June 3, 2019 May 28, 2019 May 20, 2019 May 13, 2019 May 6, 2019 April 29, 2019 April 22, 2019 April 15, 2019 April 8, 2019 April 1, 2019 March 25, 2019 March 18, 2019 (canceled due to HOW 2019) March 11, 2019 March 4, 2019 February 25, 2019 February 19, 2019 February 11, 2019 February 4, 2019 January 28, 2019 (canceled due to F2F meeting) January 22, 2019 January 14, 2019 January 7, 2019 December 31, 2018 (canceled) December 24, 2018 (canceled) December 17, 2018 December 10, 2018 December 3, 2018 November 26, 2018 November 19, 2018 November 13, 2018 November 5, 2018 (canceled) October 29, 2018 (canceled) October 22, 2018 (canceled) October 15, 2018 October 8, 2018 October 1, 2018 September 24, 2018 September 17, 2018 September 10, 2018 September 4, 2018 August 27, 2018 August 20, 2018 August 13, 2018 August 6, 2018","title":"Meeting Minutes"},{"location":"#archived-meeting-minutes","text":"For archived meeting minutes, see the GitHub repository","title":"Archived Meeting Minutes"},{"location":"external-oasis-repos/","text":"External OASIS Repositories We offer hosting of non-OSG CVMFS repositories on OASIS. This means that requests to create, rename, remove, or blanking OASIS repositories will come in as GOC tickets. This document contains instructions for handling those tickets. 
Also see Policy for OSG Mirroring of External CVMFS repositories External OASIS repository Requests to Host a Repository on OASIS Ensure that the repository administrator is valid for the VO. This can be done by (a) OSG already having a relationship with the person or (b) contacting the VO manager to find out. Also, the person should be listed in the OSG topology contacts list . Review the provided URL and verify that it is appropriate for the VO and no other project uses it already. In order to make sure the name in the URL is appropriate, check that the name is derived from the VO name or one of its projects. Then, add the repository URL to the topology for the given VO under the OASISRepoURLs . This should cause the repository's configuration to be added to the OSG Stratum-0 within 15 minutes after the URL is added into the topology. For example, if the new URL is for the VO DUNE http://hcc-cvmfs-repo.unl.edu:8000/cvmfs/dune.osgstorage.org edit the following under the OASIS section and create a PR: git clone git://github.com/opensciencegrid/topology.git vim topology/virtual-organizations/DUNE.yaml ... OASIS: OASISRepoURLs: - http://hcc-cvmfs-repo.unl.edu:8000/cvmfs/dune.osgstorage.org/ ... When the PR is approved, check on the oasis.opensciencegrid.org host whether the new repository was successfully signed. There should be a message about it in the log file /var/log/oasis/generate_whitelists.log : Tue Sep 25 17:34:02 2018 Running add_osg_repository http://hcc-cvmfs-repo.unl.edu:8000/cvmfs/dune.osgstorage.org dune.osgstorage.org: Signing 7 day whitelist with masterkeycard... done If the repository name ends in a new domain name that has not been distributed before, a new domain key will be needed on oasis-replica which should get automatically downloaded from the etc/cvmfs/keys directory in the master branch of the config-repo github repository . There should be a message about downloading it in the log file /var/log/cvmfs/generate_replicas.log . After the key is downloaded, the repository should also be automatically added, with messages in the same log file. After the repository is successfully on oasis-replica, in addition you need to update the OSG configuration repository. Make changes in a workspace cloned from the config-repo github repository and use the osg branch (or a branch made from it) in a personal account on oasis-itb . Add a domain configuration in etc/cvmfs/domain.d that's a lot like one of the other imported domains, for example egi.eu.conf . The server URLs might be slightly different; use the URLs of the stratum 1s where it is already hosted if there are any, and you can add at least the FNAL and BNL stratum 1s. Copy key(s) for the domain into etc/cvmfs/keys from the master branch, either a single .pub file or a directory, whichever the master branch has. Test all these changes out on the config-osg.opensciencegrid.org repository on oasis-itb using the copy_config_osg command, and configure a test client to read from oasis-itb.opensciencegrid.org instead of oasis.opensciencegrid.org . Then commit those changes into a new branch you made from the osg branch, and make a pull request. Once that PR is approved and merged, log in to the oasis machine and run copy_config_osg as root there to copy from github to the production configuration repository on the oasis machine. 
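(Aside: before involving the BNL Stratum-1 administrator in the next step, it can be useful to confirm that oasis-replica is already serving the new repository. A rough sketch, assuming the standard CVMFS layout in which every repository publishes a .cvmfspublished file at its root; the repository name below is just the DUNE example from above.)

    import urllib.request

    repo = "dune.osgstorage.org"  # example repository name from above
    url = "http://oasis-replica.opensciencegrid.org:8002/cvmfs/%s/.cvmfspublished" % repo

    try:
        with urllib.request.urlopen(url, timeout=30) as resp:
            print("%s -> HTTP %s; the replica is serving the repository" % (url, resp.status))
    except Exception as exc:
        print("%s -> %s; the repository does not appear to be replicated yet" % (url, exc))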
If it does match one of those two patterns, then respond to the ticket to tell the administrator to continue with their next step (their step 4). We don't want them to continue before 15 minutes has elapsed after step 2 above, so either wait that much time or tell them the time they may proceed (15 minutes after you updated topology). Then wait until the admin has updated the ticket to indicate that they have completed their step before moving on. Ask the administrator of the BNL stratum 1 (John De Stefano) to also add the new repository. The BNL Stratum-1 administrator should set the service to read from http://oasis-replica.opensciencegrid.org:8002/cvmfs/ . When the BNL Stratum-1 administrator has reported back that the replication is ready, respond to the requester that the repository is fully replicated on the OSG and close the ticket. Requests to Change the URL of an External Repository If there is a request to change the URL of an external repository, update the registered value in OASISRepoURLs for the respective VO in the topology. Tell the requester that it is ready 15 minutes after topology is updated. Requests to Remove an External Repository After validating that the ticket submitter is authorized by the VO's OASIS manager, delete the registered value for in topology for the VO in OASIS Repo URLs. Verify that it is removed by running the following on any oasis machine to make sure it is missing from the list: print_osg_repos|grep Check if the repository has been replicated to RAL by looking in their repositories.json . The user documentation requests the user to make a GGUS ticket to do this, so either ask them to do it or do it yourself. Add the BNL Stratum-1 operator (John De Stefano) to the ticket and ask him to remove the repository. Wait for him to finish before proceeding. Add the FNAL Stratum-1 operators (Merina Albert, Hyun Woo Kim) to the ticket and ask them when they can be ready to delete the repository. They can't remove it before it is removed from oasis-replica because their Stratum-1 automatically adds all repositories oasis-replica has. However, it has to be done within 8 hours of removal on oasis-replica or an alarm will start going off. Run the following command on oasis , oasis-itb , oasis-replica and oasis-replica-itb : remove_osg_repository -f Tell the FNAL Stratum-1 operators to go ahead and remove the repository. Response to Security Incident on an External Repository If there is a security incident on the publishing machine of an external repository and a publishing key is compromised, the fingerprint of that key should be added to /cvmfs/config-osg.opensciencegrid.org/etc/cvmfs/blacklist . In addition, another line should be added in the form . When the BNL Stratum-1 administrator has reported back that the replication is ready, respond to the requester that the repository is fully replicated on the OSG and close the ticket.","title":"Requests to Host a Repository on OASIS"},{"location":"external-oasis-repos/#requests-to-change-the-url-of-an-external-repository","text":"If there is a request to change the URL of an external repository, update the registered value in OASISRepoURLs for the respective VO in the topology. 
Tell the requester that it is ready 15 minutes after topology is updated.\",\"title\":\"Requests to Change the URL of an External Repository\"},{\"location\":\"external-oasis-repos/#requests-to-remove-an-external-repository\",\"text\":\"After validating that the ticket submitter is authorized by the VO's OASIS manager, delete the registered value for in topology for the VO in OASIS Repo URLs. Verify that it is removed by running the following on any oasis machine to make sure it is missing from the list: print_osg_repos|grep Check if the repository has been replicated to RAL by looking in their repositories.json . The user documentation requests the user to make a GGUS ticket to do this, so either ask them to do it or do it yourself. Add the BNL Stratum-1 operator (John De Stefano) to the ticket and ask him to remove the repository. Wait for him to finish before proceeding. Add the FNAL Stratum-1 operators (Merina Albert, Hyun Woo Kim) to the ticket and ask them when they can be ready to delete the repository. They can't remove it before it is removed from oasis-replica because their Stratum-1 automatically adds all repositories oasis-replica has. However, it has to be done within 8 hours of removal on oasis-replica or an alarm will start going off. Run the following command on oasis , oasis-itb , oasis-replica and oasis-replica-itb : remove_osg_repository -f Tell the FNAL Stratum-1 operators to go ahead and remove the repository.\",\"title\":\"Requests to Remove an External Repository\"},{\"location\":\"external-oasis-repos/#response-to-security-incident-on-an-external-repository\",\"text\":\"If there is a security incident on the publishing machine of an external repository and a publishing key is compromised, the fingerprint of that key should be added to /cvmfs/config-osg.opensciencegrid.org/etc/cvmfs/blacklist . In addition, another line should be added in the form /server.conf , add the lines: CVMFS_COMPRESSION_ALGORITHM=none CVMFS_GARBAGE_COLLECTION=true CVMFS_AUTO_GC=true CVMFS_AUTO_GC_TIMESPAN=\"2 days ago\" CVMFS_EXTERNAL_DATA=true CVMFS_AUTO_TAG_TIMESPAN=\"2 weeks ago\" Check the file for duplicate lines of the above with different settings and comment ( # ) out those lines. The important line is CVMFS_COMPRESSION_ALGORITHM . If it is set to the default, then CVMFS clients will expect the data to be delivered in compressed format, while the caches will deliver the file in uncompressed format. Configure CVMFS-sync cvmfs-sync synchronizes the data from an XRootD server (origin) to a CVMFS repo. Create a new config file (by copying another existing config) in /etc/cvmfs-sync . The name of the configuration should be .config In the config, you will need to modify the repo, source and destination. This is where cvmfs-sync will scan for new files to add to the CVMFS repo. Make the systemd timer Copy an existing timer like: cp -r /etc/systemd/system/cvmfs-data-update@gwosc.osgstorage.org.service.d /etc/systemd/system/cvmfs-data-update@<reponame>.service.d You may need to edit the override file in the directory above to change the user. 
Enable the timer: systemctl enable cvmfs-data-update@.timer systemctl start cvmfs-data-update@.timer Checking cvmfs-sync journalctl -u cvmfs-data-update@","title":"Adding External CVMFS repos"},{"location":"services/adding-external-cvmfs-repos/#adding-external-cvmfs-repos","text":"This document describes how to add an external repo like public-uc.osgstorage.org or ligo.storage.igwn.org.","title":"Adding External CVMFS Repos"},{"location":"services/adding-external-cvmfs-repos/#use-osg-docs","text":"Authoritative source for adding new oasis repo is Install an OASIS Repository . Follow instructions starting at heading \"Creating a Repository\", your first command should be cvmfs_server mkfs When it asks you to open a ticket, if it's an osgstorage.org or opensciencegrid.org domain, then all you need to do is add the CVMFS repo to topology like: https://github.com/opensciencegrid/topology/pull/3986 Once you have completed adding the fetch-cvmfs-whitelist line to cron, you are done with the OSG documentation.","title":"Use OSG Docs"},{"location":"services/adding-external-cvmfs-repos/#configure-the-external-cvmfs-repo","text":"In /etc/cvmfs/repositories.d//server.conf , add the lines: CVMFS_COMPRESSION_ALGORITHM=none CVMFS_GARBAGE_COLLECTION=true CVMFS_AUTO_GC=true CVMFS_AUTO_GC_TIMESPAN=\"2 days ago\" CVMFS_EXTERNAL_DATA=true CVMFS_AUTO_TAG_TIMESPAN=\"2 weeks ago\" Check in the file for duplicate lines of the above with different settings, comment ( # ) out those lines. The imporatant line is CVMFS_COMPRESSION_ALGORITHM . If it is set to the default, then CVMFS clients will expect the data to be delivered in compressed format, while the caches will deliver the file in un-compressed format.","title":"Configure the external CVMFS repo"},{"location":"services/adding-external-cvmfs-repos/#configure-cvmfs-sync","text":"cvmfs-sync synchonizes the data from an XRootD server (origin) to a CVMFS repo. Create a new config file (by copying another existing config) in /etc/cvmfs-sync . The name of the configuration should be .config In the config, you will need to modify the repo, source and destination. This is where cvmfs-sync will scan for new files to add to the CVMFS repo.","title":"Configure CVMFS-sync"},{"location":"services/adding-external-cvmfs-repos/#make-the-systemd-timer","text":"Copy an existing timer like: cp - r / etc / systemd / system / cvmfs - data - update @gwosc . osgstorage . org . service . d / etc / systemd / system / cvmfs - data - update @ < reponame > . service . d You may need to edit the override file in the directory above to change the user. 
Enable the timer: systemctl enable cvmfs-data-update@.timer systemctl start cvmfs-data-update@.timer","title":"Make the systemd timer"},{"location":"services/adding-external-cvmfs-repos/#checking-cvmfs-sync","text":"journalctl -u cvmfs-data-update@","title":"Checking cvmfs-sync"},{"location":"services/ce-monitoring-dashboards/","text":"CE Monitoring Dashboards Links to CE Monitoring Dashboards: Ganglia - Miron GlideIn View OSG CPU/GPU Hours Table GRACC: GPU Utilization by Project / Site GRACC: OSG GPU Payload Jobs Summary","title":"CE Monitoring Dashboards"},{"location":"services/ce-monitoring-dashboards/#ce-monitoring-dashboards","text":"Links to CE Monitoring Dashboards: Ganglia - Miron GlideIn View OSG CPU/GPU Hours Table GRACC: GPU Utilization by Project / Site GRACC: OSG GPU Payload Jobs Summary","title":"CE Monitoring Dashboards"},{"location":"services/finalize-cache-registration/","text":"Finalizing New Cache Registration Once a new cache is registered with OSG, there are additional operations tasks that must be performed before it is usable by clients. The steps on this page are for OSG Operations; sysadmins should follow the cache registration document and open a support ticket to have these steps executed. Un-Authenticated Cache Test to make sure the cache is working by executing the following: console $ curl http://hcc-stash.unl.edu:8000/user/rynge/public/test.txt Hello! Open a pull request to add the cache to https://github.com/opensciencegrid/StashCache/blob/master/bin/caches.json (obsolete) file within the StashCache repo. Open a pull request adding the cache to CVMFS_EXTERNAL_URL in the https://github.com/opensciencegrid/oasis-server/blob/master/goc/config-osg/etc/cvmfs/domain.d/osgstorage.org.conf (obsolete) file. Authenticated Cache For an authenticated cache, it will need to be added to the specific CVMFS configuration for the authenticated domain. For example, if it is a LIGO authenticated cache, it will need to be added to the CVMFS_EXTERNAL_URL within the ligo.osgstorage.org.conf file in the https://github.com/opensciencegrid/oasis-server/tree/master/goc/config-osg/etc/cvmfs/config.d (obsolete) directory. A CMS authenticated cache will need to be added to the cms.osgstorage.org.conf file Open a pull request adding the authenticated cache to CVMFS_EXTERNAL_URL in the appropriate domain configuration file within https://github.com/opensciencegrid/oasis-server/tree/master/goc/config-osg/etc/cvmfs/config.d (obsolete). Coordinate with the VO to test that authorization works. As each VO is expected to export a different directory and require different authorizations, a custom test must be arranged each time.","title":"Finalize Cache Registration"},{"location":"services/finalize-cache-registration/#finalizing-new-cache-registration","text":"Once a new cache is registered with OSG, there are additional operations tasks that must be performed before it is usable by clients. The steps on this page are for OSG Operations; sysadmins should follow the cache registration document and open a support ticket to have these steps executed.","title":"Finalizing New Cache Registration"},{"location":"services/finalize-cache-registration/#un-authenticated-cache","text":"Test to make sure the cache is working by executing the following: console $ curl http://hcc-stash.unl.edu:8000/user/rynge/public/test.txt Hello! Open a pull request to add the cache to https://github.com/opensciencegrid/StashCache/blob/master/bin/caches.json (obsolete) file within the StashCache repo. 
Open a pull request adding the cache to CVMFS_EXTERNAL_URL in the https://github.com/opensciencegrid/oasis-server/blob/master/goc/config-osg/etc/cvmfs/domain.d/osgstorage.org.conf (obsolete) file.","title":"Un-Authenticated Cache"},{"location":"services/finalize-cache-registration/#authenticated-cache","text":"For an authenticated cache, it will need to be added to the specific CVMFS configuration for the authenticated domain. For example, if it is a LIGO authenticated cache, it will need to be added to the CVMFS_EXTERNAL_URL within the ligo.osgstorage.org.conf file in the https://github.com/opensciencegrid/oasis-server/tree/master/goc/config-osg/etc/cvmfs/config.d (obsolete) directory. A CMS authenticated cache will need to be added to the cms.osgstorage.org.conf file Open a pull request adding the authenticated cache to CVMFS_EXTERNAL_URL in the appropriate domain configuration file within https://github.com/opensciencegrid/oasis-server/tree/master/goc/config-osg/etc/cvmfs/config.d (obsolete). Coordinate with the VO to test that authorization works. As each VO is expected to export a different directory and require different authorizations, a custom test must be arranged each time.","title":"Authenticated Cache"},{"location":"services/gracc-corrections/","text":"Installing GRACC Corrections GRACC Corrections are used to modify records during the summarization process. RAW records are not modified in the correction process. The correction is applied after summarization and aggregation, but before the record is enriched with data from Topology . The correction is step 3 in the GRACC summary record workflow: Raw record is received. The raw record is never modified Summarizer aggregates the raw records Corrections are applied Summarized records are enriched by Topology Summarized and enriched records are uploaded to GRACC We can currently correct: VO Names Project Names OIM_Site (using the Host_description field) Limitations Additional corrections can be written, but some attributes are used to detect duplicate records, and are therefore protected from corrections. Protected records for summarization are: EndTime, RawVOName, RawProjectName, DN, Processors, ResourceType, CommonName, Host_description, Resource_ExitCode, Grid, ReportableVOName, ProbeName For example, we could not write a correction for the Host_description . If we had a correction that changed Host_description , then the duplicate detection would not detect the same record during resummarization and it would have duplicate summarized records. Command Line The gracc-correct tool is used to create, update, and delete corrections. The tool must be run from a host that can write to GRACC, which is very restricted. It is recommended to run the gracc-correct tool directly from the gracc.opensciencegrid.org host. The gracc-correct tool is able to parse new corrections either individually from user input or many at once from a CSV file. User Input Each correction attempts to match one or more attributes of the summarized record in order to set another attribute. For example, for the VO correction: $ gracc-correct vo add Field ( s ) to correct: VOName: ReportableVOName: Corrected VOName: CSV File A CSV file can be specified in order to specify multiple corrections in a single batch update. The CSV file must be of a certain format. No Header Row The number of columns must be at least the number of matching attributes and the corrected attribute. For example, a CSV file for VO corrections would be of format: ,,,.... 
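Purely as an illustration (confirm the exact column order against the gracc-correct prompts shown above), one row of a VO-correction CSV that matches on VOName and ReportableVOName and supplies the corrected name might look like:
osg.example,example,Example-VO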
The CSV file can be specified on the command line with the option --csv , for example: ./gracc-correct vo add --csv ","title":"GRACC Corrections"},{"location":"services/gracc-corrections/#installing-gracc-corrections","text":"GRACC Corrections are used to modify records during the summarization process. RAW records are not modified in the correction process. The correction is applied after summarization and aggregation, but before the record is enriched with data from Topology . The correction is step 3 in the GRACC summary record workflow: Raw record is received. The raw record is never modified Summarizer aggregates the raw records Corrections are applied Summarized records are enriched by Topology Summarized and enriched records are uploaded to GRACC We can currently correct: VO Names Project Names OIM_Site (using the Host_description field)","title":"Installing GRACC Corrections"},{"location":"services/gracc-corrections/#limitations","text":"Additional corrections can be written, but some attributes are used to detect duplicate records, and are therefore protected from corrections. Protected records for summarization are: EndTime, RawVOName, RawProjectName, DN, Processors, ResourceType, CommonName, Host_description, Resource_ExitCode, Grid, ReportableVOName, ProbeName For example, we could not write a correction for the Host_description . If we had a correction that changed Host_description , then the duplicate detection would not detect the same record during resummarization and it would have duplicate summarized records.","title":"Limitations"},{"location":"services/gracc-corrections/#command-line","text":"The gracc-correct tool is used to create, update, and delete corrections. The tool must be run from a host that can write to GRACC, which is very restricted. It is recommended to run the gracc-correct tool directly from the gracc.opensciencegrid.org host. The gracc-correct tool is able to parse new corrections either individually from user input or many at once from a CSV file.","title":"Command Line"},{"location":"services/gracc-corrections/#user-input","text":"Each correction attempts to match one or more attributes of the summarized record in order to set another attribute. For example, for the VO correction: $ gracc-correct vo add Field ( s ) to correct: VOName: ReportableVOName: Corrected VOName: ","title":"User Input"},{"location":"services/gracc-corrections/#csv-file","text":"A CSV file can be specified in order to specify multiple corrections in a single batch update. The CSV file must be of a certain format. No Header Row The number of columns must be at least the number of matching attributes and the corrected attribute. For example, a CSV file for VO corrections would be of format: ,,,.... The CSV file can be specified on the command line with the option --csv , for example: ./gracc-correct vo add --csv ","title":"CSV File"},{"location":"services/hosted-ce-definitions/","text":"OSG Hosted CE Definitions The OSG provides a Hosted CE service. In general, this document lists what an instance of that service can and cannot do. Hosted CEs in General Benefits The site continues to operate its own batch system according to local considerations; OSG operates the interface between OSG and the site, aka the Hosted CE; To the site, OSG simply looks like a set of user accounts; and OSG uses the accounts to provision site resources for various science user communities, and hence the site has complete control over resource allocation via local policies on the accounts. 
Prerequisites In general, the site must operate a working batch system that is accessible via at least one head node; OSG works with HTCondor, Slurm, PBS Pro/Torque, LSF, and Grid Engine. Site operations include hardware and software maintenance, defining and implementing usage policies, monitoring, troubleshooting, etc. These are the same activities to support local users. In addition, the site: Must communicate with OSG their intent to share resources \u2014 in most cases, a meeting between site and OSG staff should be sufficient to discuss goals, plans, etc.; Must meet the technical requirements on the OSG website , summarized below: The site is willing to add OSG user accounts with inbound SSH access and submit privileges, A mechanism exists for transferring files between the head nodes and worker nodes, and Worker nodes must have outbound Internet access and temporary storage space for jobs. Is strongly encouraged to tell OSG about preferred constraints on resource requests (e.g., per-job limits on CPUs, memory, and storage; overall limits on number of running and idle jobs; submission rates), so that OSG can tailor such requests to better fit the site. Standard Hosted CE A Standard Hosted CE is the default case in which the interaction between OSG and the site is relatively simple and easy to maintain. Most sites fall into this category. Benefits Configuration is limited to basics, so there is less upfront and ongoing work for OSG and the site; OSG maintains and shares mappings from user groups to OSG user accounts on the site, so that the site can \u2014 if desired \u2014 limit resource allocations to certain groups; and OSG maintains the required OSG configuration on the site\u2019s head node and worker nodes (if the site provides a distribution mechanism to worker nodes, such as a shared file system). Site Responsibilities In addition to the general prerequisites above, the following apply to a Standard Hosted CE: The site must create and maintain 20 OSG user accounts on a single head node; note that: OSG will access their accounts via SSH using one RSA key for all 20 accounts; and All 20 OSG accounts must be able to submit to the local batch system. The site may control the resources allocated to different OSG user groups by writing and maintaining policies on the OSG user accounts within the batch system. The site provides privilege separation among the OSG user groups via the OSG user accounts and standard Unix privilege separation.","title":"Hosted CE Definitions"},{"location":"services/hosted-ce-definitions/#osg-hosted-ce-definitions","text":"The OSG provides a Hosted CE service. 
In general, this document lists what an instance of that service can and cannot do.","title":"OSG Hosted CE Definitions"},{"location":"services/hosted-ce-definitions/#hosted-ces-in-general","text":"","title":"Hosted CEs in General"},{"location":"services/hosted-ce-definitions/#benefits","text":"The site continues to operate its own batch system according to local considerations; OSG operates the interface between OSG and the site, aka the Hosted CE; To the site, OSG simply looks like a set of user accounts; and OSG uses the accounts to provision site resources for various science user communities, and hence the site has complete control over resource allocation via local policies on the accounts.","title":"Benefits"},{"location":"services/hosted-ce-definitions/#prerequisites","text":"In general, the site must operate a working batch system that is accessible via at least one head node; OSG works with HTCondor, Slurm, PBS Pro/Torque, LSF, and Grid Engine. Site operations include hardware and software maintenance, defining and implementing usage policies, monitoring, troubleshooting, etc. These are the same activities to support local users. In addition, the site: Must communicate with OSG their intent to share resources \u2014 in most cases, a meeting between site and OSG staff should be sufficient to discuss goals, plans, etc.; Must meet the technical requirements on the OSG website , summarized below: The site is willing to add OSG user accounts with inbound SSH access and submit privileges, A mechanism exists for transferring files between the head nodes and worker nodes, and Worker nodes must have outbound Internet access and temporary storage space for jobs. Is strongly encouraged to tell OSG about preferred constraints on resource requests (e.g., per-job limits on CPUs, memory, and storage; overall limits on number of running and idle jobs; submission rates), so that OSG can tailor such requests to better fit the site.","title":"Prerequisites"},{"location":"services/hosted-ce-definitions/#standard-hosted-ce","text":"A Standard Hosted CE is the default case in which the interaction between OSG and the site is relatively simple and easy to maintain. Most sites fall into this category.","title":"Standard Hosted CE"},{"location":"services/hosted-ce-definitions/#benefits_1","text":"Configuration is limited to basics, so there is less upfront and ongoing work for OSG and the site; OSG maintains and shares mappings from user groups to OSG user accounts on the site, so that the site can \u2014 if desired \u2014 limit resource allocations to certain groups; and OSG maintains the required OSG configuration on the site\u2019s head node and worker nodes (if the site provides a distribution mechanism to worker nodes, such as a shared file system).","title":"Benefits"},{"location":"services/hosted-ce-definitions/#site-responsibilities","text":"In addition to the general prerequisites above, the following apply to a Standard Hosted CE: The site must create and maintain 20 OSG user accounts on a single head node; note that: OSG will access their accounts via SSH using one RSA key for all 20 accounts; and All 20 OSG accounts must be able to submit to the local batch system. The site may control the resources allocated to different OSG user groups by writing and maintaining policies on the OSG user accounts within the batch system. 
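As one hedged illustration only (not an OSG requirement; the account name and limits are placeholders), a Slurm site could cap the aggregate OSG footprint with something like:
root@host # sacctmgr add account osg Description="OSG opportunistic use"
root@host # sacctmgr modify account osg set GrpTRES=cpu=512 MaxSubmitJobs=2000
Equivalent group quotas or fair-share policies can be written for HTCondor, PBS Pro/Torque, LSF, or Grid Engine.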
The site provides privilege separation among the OSG user groups via the OSG user accounts and standard Unix privilege separation.","title":"Site Responsibilities"},{"location":"services/install-gwms-factory/","text":"GlideinWMS Factory Installation This document describes how to install a Glidein Workflow Managment System (GlideinWMS) Factory instance. This document assumes expertise with HTCondor and familiarity with the GlideinWMS software. It does not cover anything but the simplest possible install. Please consult the GlideinWMS reference documentation for advanced topics, including non-root, non-RPM-based installation. In this document the terms glidein and pilot (job) will be used interchangeably. This parts covers these primary components of the GlideinWMS system: WMS Collector / Schedd : A set of condor_collector and condor_schedd processes that allow the submission of pilots to Grid entries. GlideinWMS Factory : The process submitting the pilots when needed Warning We really recommend you to use the OSG provided Factory and not to install your own . A VO Frontend is sufficient to submit your jobs and to decide scheduling policies. And this will avoid for you the complexity to deal directly with grid/cloud sites. If you really need you own Factory be aware that it is a complex component and may require a non trivial maintenance effort. Before Starting Before starting the installation process, consider the following points (consulting the Reference section below as needed): Requirements Host and OS A host to install the GlideinWMS Factory (pristine node). Currently most of our testing has been done on Scientific Linux 6 and 7. Root access The GlideinWMS Factory has the following requirements: CPU : 4-8 cores for a large installation (1 should suffice on a small install) RAM : 4-8GB on a large installation (1GB should suffice for small installs) Disk : 10GB will be plenty sufficient for all the binaries, config and log files related to GlideinWMS. If you are a large site with need to keep significant history and logs, you may want to allocate 100GB+ to store long histories. Users The GlideinWMS Factory installation will create the following users unless they are already created . User Default uid Comment condor none HTCondor user (installed via dependencies). gfactory none This user runs the GlideinWMS VO factory. To verify that the user gfactory has gfactory as primary group check the output of root@host # getent passwd gfactory | cut -d: -f4 | xargs getent group It should be the gfactory group. Certificates Certificate User that owns certificate Path to certificate Host certificate root /etc/grid-security/hostcert.pem /etc/grid-security/hostkey.pem Here are instructions to request a host certificate. The host certificate/key is used for authorization, however, authorization between the Factory and the GlideinWMS collector is done by file system authentication. Networking Firewalls It must be on the public internet, with at least one port open to the world; all worker nodes will load data from this node trough HTTP. Note that worker nodes will also need outbound access in order to access this HTTP port. Installation Procedure As with all OSG software installations, there are some one-time (per host) steps to prepare in advance: Ensure the host has a supported operating system Obtain root access to the host Prepare the required Yum repositories Install CA certificates Installing HTCondor Most required software is installed from the Factory RPM installation. 
HTCondor is the only exception since there are many different ways to install it , using the RPM system or not. You need to have HTCondor installed before installing the GlideinWMS Factory. If yum cannot find a HTCondor RPM, it will install the dummy empty-condor RPM, assuming that you installed HTCondor using a tarball distribution. If you don't have HTCondor already installed, you can install the HTCondor RPM from the OSG repository: root@host # yum install condor.x86_64 Installing HTCondor-BOSCO If you plan to send jobs using direct batch submission (aka BOSCO), then you need also the condor-bosco package. You'll have to install the package and remove one of its files /etc/condor/config.d/60-campus_factory.config because it interferes with the Factory configuration. root@host # yum install condor-bosco root@host # rm /etc/condor/config.d/60-campus_factory.config root@host # touch /etc/condor/config.d/60-campus_factory.config Install GWMS Factory Download and install the Factory RPM Install the RPM and dependencies (be prepared for a lot of dependencies). root@host # yum install glideinwms-factory This will install the current production release verified and tested by OSG with default HTCondor configuration. This command will install the GlideinWMS Factory, HTCondor, the OSG client, and all the required dependencies. If you wish to install a different version of GlideinWMS, add the \"--enablerepo\" argument to the command as follows: yum install --enablerepo=osg-testing glideinwms-factory : The most recent production release, still in testing phase. This will usually match the current tarball version on the GlideinWMS home page . (The osg-release production version may lag behind the tarball release by a few weeks as it is verified and packaged by OSG). Note that this will also take the osg-testing versions of all dependencies as well. yum install --enablerepo=osg-upcoming glideinwms-factory : The most recent development series release, ie version 3.3.x release. This has newer features such as cloud submission support, but is less tested. Download HTCondor tarballs You will need to download HTCondor tarballs for each architecture that you want to deploy pilots on . At this point, GlideinWMS factory does not support pulling HTCondor binaries from your system area. Suggested is that you put these binaries in /var/lib/gwms-factory/condor but any gfactory accessible location should suffice. Configuration Procedure After installing the RPM you need to configure the components of the GlideinWMS Factory: Edit Factory configuration options Edit HTCondor configuration options Create a HTCondor grid map file Reconfigure and Start Factory Configuring the Factory The configuration file is /etc/gwms-factory/glideinWMS.xml . The next steps will describe each line that you will need to edit for most cases, but you may want to review the whole file to be sure that it is configured correctly. Security configuration In the security section, you will need to provide each Frontend that is allowed to communicate with the Factory: security key_length=\"2048\" pub_key=\"RSA\" remove_old_cred_age=\"30\" remove_old_cred_freq=\"24\" reuse_oldkey_onstartup_gracetime=\"900\"> These attributes are very important to get exactly right or the Frontend will not be trusted. 
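To give a rough idea of the shape of that block, here is a sketch of a single allowed Frontend inside the security section (the frontend name, identity, and mapped username are placeholders; see the GlideinWMS reference documentation for the authoritative schema):
<frontends>
  <frontend name="vo_frontend" identity="vofrontend_service@frontend.example.edu">
    <security_classes>
      <security_class name="frontend" username="fe_user"/>
    </security_classes>
  </frontend>
</frontends>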
This should match one of the factory and security sections of the Frontend configuration Configuring the GlideinWMS Frontend in the following way: Note This is a snippet from the Frontend configuration (for reference), not the Factory that you are configuring now! For the factory section: # from frontend.xml .... For the security: # from frontend.xml Note that the identity of the Frontend must match what HTCondor authenticates the DN of the frontend to. In /etc/condor/certs/condor_mapfile , there must be an entry with vofrontend_service definition (in this case): GSI \"^\\/DC\\=org\\/DC\\=doegrids\\/OU\\=Services\\/CN\\=Some\\ Name\\ 834323%ENDCOLOR%$\" % GREEN % vofrontend_service % ENDCOLOR % Entry configuration Entries are grid/cloud endpoints (aka Compute Elements, or gatekeepers) that can accept job requests and run pilots (which will run user jobs). Each entry needs to be configured to communicate to a specific gatekeeper. An example test entry is provided in the default GlideinWMS configuration file. At the very least, you will need to modify the entry line: You will need to modify the entry name and gatekeeper . This will determine the gatekeeper that you access. Specific gatekeepers often require specific \"rsl\" attributes that determine the job queue that you are submitting to, or other attributes. Add them in the rsl attribute. Also, be sure to distribute your entries across the various HTCondor schedd work managers to balance load. To see the available schedd use condor_status -schedd -l | grep Name . Several schedd options are configured by default for you: schedd_glideins2, schedd_glideins3, schedd_glideins4, schedd_glideins5 , as well as the default schedd . This can be modified in the HTCondor configuration. Add any specific options, such as limitations on jobs/pilots or glexec/voms requirements in the entry section below the above line. More details are in the GlideinWMS Factory configuration guide . !!! warning If there is no match between auth_metod and trust_domain of the entry and the type and trust_domain listed in one of the credentials of one of the Frontends using this Factory, then no job can run on that entry. The Factory must advertise the correct Resource Name of each entry for accounting purposes. Then the Factory must also advertise in the entry all the attributes that will allow to match the query expression used in the Frontends connecting to this Factory (e.g. as explained in the VO frontend configuration document ). Note Keep an eye on this part as we're dealing with singularity. Then you must advertise correctly if the site supports gLExec . If it does not set GLEXEC_BIN to NONE , if gLExec is installed via OSG set it to OSG , otherwise set it to the path of gLExec. For example this snippet advertises GLIDEIN_Supported_VOs attribute with the supported VO so that can be used with the query above in the VO frontend and says that the resource does not support gLExec: ... ... Note Specially if jobs are sent to OSG resources, it is very important to set the GLIDEIN_Resource_Name and to be consistent with the Resource Name reported in OIM because that name will be used for job accounting in Gratia. It should be the name of the Resource in OIM or the name of the Resource Group (specially if there are many gatekeepers submitting to the same cluster). More information on options can be found here Configuring Tarballs Each pilot will download HTCondor binaries from the staging area. 
Often, multiple binaries are needed to support various architectures and platforms. Currently, you will need to provide at least one tarball for GlideinWMS to use. (Using the system binaries is currently not supported). Download a HTCondor tarball from here . Suggested is to put the binaries in /var/lib/gwms-factory/condor , but any factory-accessible location will do just fine. Once you have downloaded the tarball, configure it in /etc/gwms-factory/glideinWMS.xml like in the following: Remember also to modify the condor_os and condor_arch attributes in the entries (the configured Compute Elements) to pick the correct HTCondor binary. Here are more details on using multiple HTCondor binaries. Note that is sufficient to set the base_dir ; the reconfigure command will prepare the tarball and add it to the XML config file. Configuring HTCondor The HTCondor configuration for the Factory is placed in /etc/condor/config.d . 00_gwms_factory_general.config 00-restart_peaceful.config 01_gwms_factory_collectors.config 02_gwms_factory_schedds.config 03_gwms_local.config 10-batch_gahp_blahp.config Get rid of the pre-loaded HTCondor default root@host # rm /etc/condor/config.d/00personal_condor.config root@host # touch /etc/condor/config.d/00personal_condor.config For most installations, the items you need to modify are in 03_gwms_factory_local.config . The lines you will have to edit are: Credentials of the machine. You can either run using a proxy, or a service certificate. It is recommended to use a host certificate and specify its location in the variables GSI_DAEMON_CERT and GSI_DAEMON_KEY . The host certificate should be owned by root and have the correct permissions, 600. HTCondor ids in the form UID.GID (both are integers) HTCondor admin email. Will receive messages when services fail. #-- HTCondor user: condor CONDOR_IDS = #-- Contact (via email) when problems occur CONDOR_ADMIN = ############################ # GSI Security config ############################ #-- Grid Certificate directory GSI_DAEMON_TRUSTED_CA_DIR= /etc/grid-security/certificates #-- Credentials GSI_DAEMON_CERT = /etc/grid-security/hostcert.pem GSI_DAEMON_KEY = /etc/grid-security/hostkey.pem #-- HTCondor mapfile CERTIFICATE_MAPFILE= /etc/condor/certs/condor_mapfile ################################### # Whitelist of HTCondor daemon DNs ################################### #DAEMON_LIST = COLLECTOR, MASTER, NEGOTIATOR, SCHEDD, STARTD Using other HTCondor RPMs, e.g. UW Madison HTCondor RPM The above procedure will work if you are using the OSG HTCondor RPMS. You can verify that you used the OSG HTCondor RPM by using yum list condor . The version name should include \"osg\", e.g. 8.6.9-1.1.osg34.el7 . If you are using the UW Madison HTCondor RPMS, be aware of the following changes: This HTCondor RPM uses a file /etc/condor/condor_config.local to add your local machine slot to the user pool. If you want to disable this behavior (recommended), you should blank out that file or comment out the line in /etc/condor/condor_config for LOCAL_CONFIG_FILE. (Make sure that LOCAL_CONFIG_DIR is set to /etc/condor/config.d ) Note that the variable LOCAL_DIR is set differently in UW Madison and OSG RPMs. This should not cause any more problems in the Glideinwms RPMs, but please take note if you use this variable in your job submissions or other customizations. 
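For the first point above, blanking out the personal-condor file is simply (commenting out LOCAL_CONFIG_FILE in /etc/condor/condor_config achieves the same result):
root@host # cp /dev/null /etc/condor/condor_config.local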
In general if you are using a non OSG RPM or if you added custom configuration files for HTCondor please check the order of the configuration files: root@host # condor_config_val -config Configuration source: /etc/condor/condor_config Local configuration sources: /etc/condor/config.d/00-restart_peaceful.config /etc/condor/config.d/00_gwms_factory_general.config /etc/condor/config.d/01_gwms_factory_collectors.config /etc/condor/config.d/02_gwms_factory_schedds.config /etc/condor/config.d/03_gwms_local.config /etc/condor/config.d/10-batch_gahp_blahp.config /etc/condor/condor_config.local Restarting HTCondor After configuring HTCondor, be sure to restart HTCondor: root@host # service condor restart Create a HTCondor grid mapfile. The HTCondor grid mapfile /etc/condor/certs/condor_mapfile is used for authentication between the glidein running on a remote worker node, and the local collector. HTCondor uses the mapfile to map certificates to pseudo-users on the local machine. It is important that you map the DN's of each frontend you are talking to. Below is an example mapfile, by default found in /etc/condor/certs/condor_mapfile : GSI \"^\\/DC\\=org\\/DC\\=doegrids\\/OU\\=People\\/CN\\=Some\\ Name\\ 123456$\" frontend GSI (.*) anonymous FS (.*) \\1 Each frontend needs a line that maps to the user specified in the identity argument in the frontend security section of the Factory configuration. Reconfiguring GlideinWMS After changing the configuration of GlideinWMS and making sure that Factory is running, use the following table to find the appropriate command for your operating system (run as root ): If your operating system is... Run the following command... Enterprise Linux 7 systemctl reload gwms-factory Enterprise Linux 6 service gwms-factory reconfig Note Notice that, in the case of Enterprise Linux 7 systemctl reload gwms-factory will work only if: - gwms-factory service is running - gwms-factory service was started with systemctl Otherwise, you will get the following error in any of the cases: # systemctl reload gwms-factory Job for gwms-factory.service invalid. Upgrading GlideinWMS Before you start the Factory service for the first time or after an update of the RPM or after you change GlideinWMS scripts, you should always use the GlideinWMS \"upgrade\" command. To do so: Make sure the condor and gwms-factory services are stopped (in EL6 this will be done for you). Issue the upgrade command: If you are using Enterprise Linux 7: root@host # /usr/sbin/gwms-factory upgrade If you are using Enterprise Linux 6: root@host # service gwms-factory upgrade Start the condor and gwms-factory services (see next part). Service Activation and Deactivation To start the Factory you must start also HTCondor and the Web server beside the Factory itself: # %RED%For RHEL 6 , CentOS 6 , and SL6%ENDCOLOR% root@host # service condor start root@host # service httpd start root@host # service gwms-factory start # %RED% For RHEL 7 , CentOS 7 , and SL7%ENDCOLOR% root@host # systemctl start condor root@host # systemctl start httpd root@host # systemctl start gwms-factory Note Once you successfully start using the Factory service, anytime you change the /etc/gwms-factory/glideinWMS.xml file you will need to run a reconfig/reload command. 
If you change also some code you need the upgrade command mentioned above: # %RED% For RHEL 6 , CentOS 6 , and SL6%ENDCOLOR% root@host # service gwms-factory reconfig # %RED% But the situation is a bit more complicated in RHEL 7 , CentOS 7 , and SL7 due to systemd restrictions%ENDCOLOR% # %GREEN% For reconfig:%ENDCOLOR% A. %RED% when the Factory is running%ENDCOLOR% A.1 %RED% without any additional options%ENDCOLOR% root@host # /usr/sbin/gwms-factory reconfig%ENDCOLOR% or root@host # systemctl reload gwms-factory A.2 %RED% if you want to give additional options %ENDCOLOR% systemctl stop gwms-factory /usr/sbin/gwms-factory reconfig \"and your options\" systemctl start gwms-factory B. %RED% when the Factory is NOT running %ENDCOLOR% root@host # /usr/sbin/gwms-factory reconfig ( \"and your options\" ) To enable the services so that they restart after a reboot: # %RED%# For RHEL 6 , CentOS 6 , and SL6%ENDCOLOR% root@host # /sbin/chkconfig fetch-crl-cron on root@host # /sbin/chkconfig fetch-crl-boot on root@host # /sbin/chkconfig condor on root@host # /sbin/chkconfig httpd on root@host # /sbin/chkconfig gwms-factory on # %RED%# For RHEL 7 , CentOS 7 , and SL7%ENDCOLOR% root@host # systemctl enable fetch-crl-cron root@host # systemctl enable fetch-crl-boot root@host # systemctl enable condor root@host # systemctl enable httpd root@host # systemctl enable gwms-factory To stop the Factory: # %RED%For RHEL 6 , CentOS 6 , and SL6 %ENDCOLOR% root@host # service gwms-factory stop # %RED%For RHEL 7 , CentOS 7 , and SL7%ENDCOLOR% root@host # systemctl stop gwms-factory And you can stop also the other services if you are not using them independently of the Factory. Validating GlideinWMS Factory The complete validation of the Factory is the submission of actual jobs. You can also check that the services are up and running: root@host # condor_status -any MyType TargetType Name glidefactoryclient None 12345_TEST_ENTRY@gfactory_instance@ glideclient None 12345_TEST_ENTRY@gfactory_instance@ glidefactory None TEST_ENTRY@gfactory_instance@ glidefactoryglobal None gfactory_instance@gfactory_ser glideclientglobal None gfactory_instance@gfactory_ser Scheduler None hostname.fnal.gov DaemonMaster None hostname.fnal.gov Negotiator None hostname.fnal.gov Scheduler None schedd_glideins2@hostname Scheduler None schedd_glideins3@hostname Scheduler None schedd_glideins4@hostname Scheduler None schedd_glideins5@hostname Collector None wmscollector_service@hostname You should have one \"glidefactory\" classAd for each entry that you have enabled. If you have already configured the frontends, you will also have one glidefactoryclient and one glideclient classAd for each frontend / entry. You can check also the monitoring Web page: http://YOUR_HOST_FQDN/factory/monitor/ You can also test the local submission of a job to a resource using the test script local_start.sh but you must first install the OSG client tools and generate a proxy. After that you can run the test (replace ENTRY_NAME with the name of one of the entries in /etc/gwms-factory/glideinWMS.xml ): Check Web server configuration for the monitoring Verify path and specially the URL for the GlideinWMS files served by your web server: stage base_dir = \"/var/lib/gwms-factory/web-area/stage\" use_symlink = \"True\" web_base_url = \"http://HOSTNAME:PORT/factory/stage\" This will determine the location of your web server . Make sure that the URL is visible. 
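A quick reachability sketch, run from a machine outside your cluster (use whatever hostname and port you configured above); getting any HTTP status line back is better than a connection timeout:
user@elsewhere $ curl -sI http://HOSTNAME:PORT/factory/stage/ | head -1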
Depending on your firewall or the one of your organization, you may need to change the port here and in the httpd configuration (by modifying the \"Listen\" directive in /etc/httpd/conf/httpd.conf ). Note that web servers are an often an attacked piece of infrastruture, so you may want to go through the Apache configuration in /etc/httpd/conf/httpd.conf and disable unneeded modules. Troubleshooting GlideinWMS Factory File Locations File Description File Location Comment Configuration file /etc/gwms-factory/glideinWMS.xml Main configuration file Logs /var/log/gwms-factory/server/factory Overall server logs /var/log/gwms-factory/server/entry_NAME Specific entry logs (generally more useful) /var/log/gwms-factory/client Glidein Pilot logs seperated by user and entry Startup script /etc/init.d/gwms-factory Web Directory /var/lib/gwms-factory/web-area Web Base /var/lib/gwms-factory/web-base Working Directory /var/lib/gwms-factory/work-dir/ Increase the log level and change rotation policies You can increase the log level of the frontend. To add a log file with all the log information add the following line with all the message types in the process_log section of /etc/gwms-factory/glideinWMS.xml : You can also change the rotation policy and choose whether compress the rotated files, all in the same section of the config files: max_bytes is the max size of the log files max_days it will be rotated. compression specifies if rotated files are compressed backup_count is the number of rotated log files kept Further details are in the reference documentation . Failed authentication errors If you get messages such as these in the logs, the Factory does not trust the frontend and will not submit glideins. WARNING: Client fermicloud128-fnal-gov_OSG_gWMSFrontend.main (secid: frontend_name) not in white list. Skipping request This error means that the frontend name in the security section of the Factory does not match the security_name in the frontend. Client fermicloud128-fnal-gov_OSG_gWMSFrontend.main (secid: frontend_name) is not coming from a trusted source; AuthenticatedIdentity vofrontend_condor@fermicloud130.fnal.gov!=vofrontend_factory@fermicloud130.fnal.gov. Skipping for security reasons. This error means that the identity in the security section of the Factory does not match what the /etc/condor/certs/condor_mapfile authenticates the Frontend to in HTCondor (!Authenticated Identity in the classad). Make sure the attributes are correctly lined up as in the Frontend security configuration section above. Glideins start but do not connect to User pool / VO Frontend Check the appropriate job err and out logs in /var/log/gwms-factory/client to see if any errors were reported. Often, this will be a pilot unable to access a web server or with an invalid proxy. Also, verify that the condor_mapfile is correct on the VO Frontend's user pool collector and configuration. Glideins start but fail before running job with error \"Proxy not long lived enough\" If the glideins are running on a resource (entry) but the jobs are not running and the log files in /var/log/gwms-factory/client/user_frontend/glidein_gfactory_instance/ENTRY_NAME report an error like \"Proxy not long lived enough (86096 s left), shortened retire time ...\", then probably the HTCondor RLM on the Compute Element is delegating the proxy and shortening its lifespan. This can be fixed by setting DELEGATE_JOB_GSI_CREDENTIALS = FALSE as suggested in the CE install document . 
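On the CE side this is a one-line knob; the file name below is only illustrative, so follow the CE install document for the canonical location:
root@host # echo 'DELEGATE_JOB_GSI_CREDENTIALS = FALSE' >> /etc/condor-ce/config.d/99-local.conf
root@host # condor_ce_reconfig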
References http://glideinwms.fnal.gov/doc.prd/ https://opensciencegrid.org/docs/other/install-gwms-frontend/","title":"Installing GlideinWMS Factory"},{"location":"services/install-gwms-factory/#glideinwms-factory-installation","text":"This document describes how to install a Glidein Workflow Managment System (GlideinWMS) Factory instance. This document assumes expertise with HTCondor and familiarity with the GlideinWMS software. It does not cover anything but the simplest possible install. Please consult the GlideinWMS reference documentation for advanced topics, including non-root, non-RPM-based installation. In this document the terms glidein and pilot (job) will be used interchangeably. This parts covers these primary components of the GlideinWMS system: WMS Collector / Schedd : A set of condor_collector and condor_schedd processes that allow the submission of pilots to Grid entries. GlideinWMS Factory : The process submitting the pilots when needed Warning We really recommend you to use the OSG provided Factory and not to install your own . A VO Frontend is sufficient to submit your jobs and to decide scheduling policies. And this will avoid for you the complexity to deal directly with grid/cloud sites. If you really need you own Factory be aware that it is a complex component and may require a non trivial maintenance effort.","title":"GlideinWMS Factory Installation"},{"location":"services/install-gwms-factory/#before-starting","text":"Before starting the installation process, consider the following points (consulting the Reference section below as needed):","title":"Before Starting"},{"location":"services/install-gwms-factory/#requirements","text":"","title":"Requirements"},{"location":"services/install-gwms-factory/#host-and-os","text":"A host to install the GlideinWMS Factory (pristine node). Currently most of our testing has been done on Scientific Linux 6 and 7. Root access The GlideinWMS Factory has the following requirements: CPU : 4-8 cores for a large installation (1 should suffice on a small install) RAM : 4-8GB on a large installation (1GB should suffice for small installs) Disk : 10GB will be plenty sufficient for all the binaries, config and log files related to GlideinWMS. If you are a large site with need to keep significant history and logs, you may want to allocate 100GB+ to store long histories.","title":"Host and OS"},{"location":"services/install-gwms-factory/#users","text":"The GlideinWMS Factory installation will create the following users unless they are already created . User Default uid Comment condor none HTCondor user (installed via dependencies). gfactory none This user runs the GlideinWMS VO factory. To verify that the user gfactory has gfactory as primary group check the output of root@host # getent passwd gfactory | cut -d: -f4 | xargs getent group It should be the gfactory group.","title":"Users"},{"location":"services/install-gwms-factory/#certificates","text":"Certificate User that owns certificate Path to certificate Host certificate root /etc/grid-security/hostcert.pem /etc/grid-security/hostkey.pem Here are instructions to request a host certificate. 
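Once the credentials are in place, a quick sanity check (illustrative) is:
root@host # openssl x509 -in /etc/grid-security/hostcert.pem -noout -subject -dates
root@host # ls -l /etc/grid-security/hostkey.pem
The key should be owned by root and unreadable by other users (mode 600).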
The host certificate/key is used for authorization, however, authorization between the Factory and the GlideinWMS collector is done by file system authentication.","title":"Certificates"},{"location":"services/install-gwms-factory/#networking","text":"","title":"Networking"},{"location":"services/install-gwms-factory/#firewalls","text":"It must be on the public internet, with at least one port open to the world; all worker nodes will load data from this node trough HTTP. Note that worker nodes will also need outbound access in order to access this HTTP port.","title":"Firewalls"},{"location":"services/install-gwms-factory/#installation-procedure","text":"As with all OSG software installations, there are some one-time (per host) steps to prepare in advance: Ensure the host has a supported operating system Obtain root access to the host Prepare the required Yum repositories Install CA certificates","title":"Installation Procedure"},{"location":"services/install-gwms-factory/#installing-htcondor","text":"Most required software is installed from the Factory RPM installation. HTCondor is the only exception since there are many different ways to install it , using the RPM system or not. You need to have HTCondor installed before installing the GlideinWMS Factory. If yum cannot find a HTCondor RPM, it will install the dummy empty-condor RPM, assuming that you installed HTCondor using a tarball distribution. If you don't have HTCondor already installed, you can install the HTCondor RPM from the OSG repository: root@host # yum install condor.x86_64","title":"Installing HTCondor"},{"location":"services/install-gwms-factory/#installing-htcondor-bosco","text":"If you plan to send jobs using direct batch submission (aka BOSCO), then you need also the condor-bosco package. You'll have to install the package and remove one of its files /etc/condor/config.d/60-campus_factory.config because it interferes with the Factory configuration. root@host # yum install condor-bosco root@host # rm /etc/condor/config.d/60-campus_factory.config root@host # touch /etc/condor/config.d/60-campus_factory.config","title":"Installing HTCondor-BOSCO"},{"location":"services/install-gwms-factory/#install-gwms-factory","text":"","title":"Install GWMS Factory"},{"location":"services/install-gwms-factory/#download-and-install-the-factory-rpm","text":"Install the RPM and dependencies (be prepared for a lot of dependencies). root@host # yum install glideinwms-factory This will install the current production release verified and tested by OSG with default HTCondor configuration. This command will install the GlideinWMS Factory, HTCondor, the OSG client, and all the required dependencies. If you wish to install a different version of GlideinWMS, add the \"--enablerepo\" argument to the command as follows: yum install --enablerepo=osg-testing glideinwms-factory : The most recent production release, still in testing phase. This will usually match the current tarball version on the GlideinWMS home page . (The osg-release production version may lag behind the tarball release by a few weeks as it is verified and packaged by OSG). Note that this will also take the osg-testing versions of all dependencies as well. yum install --enablerepo=osg-upcoming glideinwms-factory : The most recent development series release, ie version 3.3.x release. 
This has newer features such as cloud submission support, but is less tested.","title":"Download and install the Factory RPM"},{"location":"services/install-gwms-factory/#download-htcondor-tarballs","text":"You will need to download HTCondor tarballs for each architecture that you want to deploy pilots on . At this point, GlideinWMS factory does not support pulling HTCondor binaries from your system area. Suggested is that you put these binaries in /var/lib/gwms-factory/condor but any gfactory accessible location should suffice.","title":"Download HTCondor tarballs"},{"location":"services/install-gwms-factory/#configuration-procedure","text":"After installing the RPM you need to configure the components of the GlideinWMS Factory: Edit Factory configuration options Edit HTCondor configuration options Create a HTCondor grid map file Reconfigure and Start Factory","title":"Configuration Procedure"},{"location":"services/install-gwms-factory/#configuring-the-factory","text":"The configuration file is /etc/gwms-factory/glideinWMS.xml . The next steps will describe each line that you will need to edit for most cases, but you may want to review the whole file to be sure that it is configured correctly.","title":"Configuring the Factory"},{"location":"services/install-gwms-factory/#security-configuration","text":"In the security section, you will need to provide each Frontend that is allowed to communicate with the Factory: security key_length=\"2048\" pub_key=\"RSA\" remove_old_cred_age=\"30\" remove_old_cred_freq=\"24\" reuse_oldkey_onstartup_gracetime=\"900\"> These attributes are very important to get exactly right or the Frontend will not be trusted. This should match one of the factory and security sections of the Frontend configuration Configuring the GlideinWMS Frontend in the following way: Note This is a snippet from the Frontend configuration (for reference), not the Factory that you are configuring now! For the factory section: # from frontend.xml .... For the security: # from frontend.xml Note that the identity of the Frontend must match what HTCondor authenticates the DN of the frontend to. In /etc/condor/certs/condor_mapfile , there must be an entry with vofrontend_service definition (in this case): GSI \"^\\/DC\\=org\\/DC\\=doegrids\\/OU\\=Services\\/CN\\=Some\\ Name\\ 834323%ENDCOLOR%$\" % GREEN % vofrontend_service % ENDCOLOR %","title":"Security configuration"},{"location":"services/install-gwms-factory/#entry-configuration","text":"Entries are grid/cloud endpoints (aka Compute Elements, or gatekeepers) that can accept job requests and run pilots (which will run user jobs). Each entry needs to be configured to communicate to a specific gatekeeper. An example test entry is provided in the default GlideinWMS configuration file. At the very least, you will need to modify the entry line: You will need to modify the entry name and gatekeeper . This will determine the gatekeeper that you access. Specific gatekeepers often require specific \"rsl\" attributes that determine the job queue that you are submitting to, or other attributes. Add them in the rsl attribute. Also, be sure to distribute your entries across the various HTCondor schedd work managers to balance load. To see the available schedd use condor_status -schedd -l | grep Name . Several schedd options are configured by default for you: schedd_glideins2, schedd_glideins3, schedd_glideins4, schedd_glideins5 , as well as the default schedd . This can be modified in the HTCondor configuration. 
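To give a rough idea of the entry line being discussed, here is a sketch in which every attribute value is a placeholder (consult the GlideinWMS Factory configuration guide for the full attribute list):
<entry name="EXAMPLE_SITE" enabled="True" gatekeeper="ce.example.edu ce.example.edu:9619" gridtype="condor" auth_method="grid_proxy" trust_domain="grid" work_dir="Condor" schedd_name="schedd_glideins2@factory.example.edu">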
Add any specific options, such as limitations on jobs/pilots or glexec/voms requirements in the entry section below the above line. More details are in the GlideinWMS Factory configuration guide . !!! warning If there is no match between auth_metod and trust_domain of the entry and the type and trust_domain listed in one of the credentials of one of the Frontends using this Factory, then no job can run on that entry. The Factory must advertise the correct Resource Name of each entry for accounting purposes. Then the Factory must also advertise in the entry all the attributes that will allow to match the query expression used in the Frontends connecting to this Factory (e.g. as explained in the VO frontend configuration document ). Note Keep an eye on this part as we're dealing with singularity. Then you must advertise correctly if the site supports gLExec . If it does not set GLEXEC_BIN to NONE , if gLExec is installed via OSG set it to OSG , otherwise set it to the path of gLExec. For example this snippet advertises GLIDEIN_Supported_VOs attribute with the supported VO so that can be used with the query above in the VO frontend and says that the resource does not support gLExec: ... ... Note Specially if jobs are sent to OSG resources, it is very important to set the GLIDEIN_Resource_Name and to be consistent with the Resource Name reported in OIM because that name will be used for job accounting in Gratia. It should be the name of the Resource in OIM or the name of the Resource Group (specially if there are many gatekeepers submitting to the same cluster). More information on options can be found here","title":"Entry configuration"},{"location":"services/install-gwms-factory/#configuring-tarballs","text":"Each pilot will download HTCondor binaries from the staging area. Often, multiple binaries are needed to support various architectures and platforms. Currently, you will need to provide at least one tarball for GlideinWMS to use. (Using the system binaries is currently not supported). Download a HTCondor tarball from here . Suggested is to put the binaries in /var/lib/gwms-factory/condor , but any factory-accessible location will do just fine. Once you have downloaded the tarball, configure it in /etc/gwms-factory/glideinWMS.xml like in the following: Remember also to modify the condor_os and condor_arch attributes in the entries (the configured Compute Elements) to pick the correct HTCondor binary. Here are more details on using multiple HTCondor binaries. Note that is sufficient to set the base_dir ; the reconfigure command will prepare the tarball and add it to the XML config file.","title":"Configuring Tarballs"},{"location":"services/install-gwms-factory/#configuring-htcondor","text":"The HTCondor configuration for the Factory is placed in /etc/condor/config.d . 00_gwms_factory_general.config 00-restart_peaceful.config 01_gwms_factory_collectors.config 02_gwms_factory_schedds.config 03_gwms_local.config 10-batch_gahp_blahp.config Get rid of the pre-loaded HTCondor default root@host # rm /etc/condor/config.d/00personal_condor.config root@host # touch /etc/condor/config.d/00personal_condor.config For most installations, the items you need to modify are in 03_gwms_factory_local.config . The lines you will have to edit are: Credentials of the machine. You can either run using a proxy, or a service certificate. It is recommended to use a host certificate and specify its location in the variables GSI_DAEMON_CERT and GSI_DAEMON_KEY . 
The host certificate should be owned by root and have the correct permissions, 600. HTCondor ids in the form UID.GID (both are integers) HTCondor admin email. Will receive messages when services fail. #-- HTCondor user: condor CONDOR_IDS = #-- Contact (via email) when problems occur CONDOR_ADMIN = ############################ # GSI Security config ############################ #-- Grid Certificate directory GSI_DAEMON_TRUSTED_CA_DIR= /etc/grid-security/certificates #-- Credentials GSI_DAEMON_CERT = /etc/grid-security/hostcert.pem GSI_DAEMON_KEY = /etc/grid-security/hostkey.pem #-- HTCondor mapfile CERTIFICATE_MAPFILE= /etc/condor/certs/condor_mapfile ################################### # Whitelist of HTCondor daemon DNs ################################### #DAEMON_LIST = COLLECTOR, MASTER, NEGOTIATOR, SCHEDD, STARTD","title":"Configuring HTCondor"},{"location":"services/install-gwms-factory/#using-other-htcondor-rpms-eg-uw-madison-htcondor-rpm","text":"The above procedure will work if you are using the OSG HTCondor RPMS. You can verify that you used the OSG HTCondor RPM by using yum list condor . The version name should include \"osg\", e.g. 8.6.9-1.1.osg34.el7 . If you are using the UW Madison HTCondor RPMS, be aware of the following changes: This HTCondor RPM uses a file /etc/condor/condor_config.local to add your local machine slot to the user pool. If you want to disable this behavior (recommended), you should blank out that file or comment out the line in /etc/condor/condor_config for LOCAL_CONFIG_FILE. (Make sure that LOCAL_CONFIG_DIR is set to /etc/condor/config.d ) Note that the variable LOCAL_DIR is set differently in UW Madison and OSG RPMs. This should not cause any more problems in the Glideinwms RPMs, but please take note if you use this variable in your job submissions or other customizations. In general if you are using a non OSG RPM or if you added custom configuration files for HTCondor please check the order of the configuration files: root@host # condor_config_val -config Configuration source: /etc/condor/condor_config Local configuration sources: /etc/condor/config.d/00-restart_peaceful.config /etc/condor/config.d/00_gwms_factory_general.config /etc/condor/config.d/01_gwms_factory_collectors.config /etc/condor/config.d/02_gwms_factory_schedds.config /etc/condor/config.d/03_gwms_local.config /etc/condor/config.d/10-batch_gahp_blahp.config /etc/condor/condor_config.local","title":"Using other HTCondor RPMs, e.g. UW Madison HTCondor RPM"},{"location":"services/install-gwms-factory/#restarting-htcondor","text":"After configuring HTCondor, be sure to restart HTCondor: root@host # service condor restart","title":"Restarting HTCondor"},{"location":"services/install-gwms-factory/#create-a-htcondor-grid-mapfile","text":"The HTCondor grid mapfile /etc/condor/certs/condor_mapfile is used for authentication between the glidein running on a remote worker node, and the local collector. HTCondor uses the mapfile to map certificates to pseudo-users on the local machine. It is important that you map the DN's of each frontend you are talking to. 
Below is an example mapfile, by default found in /etc/condor/certs/condor_mapfile : GSI \"^\\/DC\\=org\\/DC\\=doegrids\\/OU\\=People\\/CN\\=Some\\ Name\\ 123456$\" frontend GSI (.*) anonymous FS (.*) \\1 Each frontend needs a line that maps to the user specified in the identity argument in the frontend security section of the Factory configuration.","title":"Create a HTCondor grid mapfile."},{"location":"services/install-gwms-factory/#reconfiguring-glideinwms","text":"After changing the configuration of GlideinWMS and making sure that Factory is running, use the following table to find the appropriate command for your operating system (run as root ): If your operating system is... Run the following command... Enterprise Linux 7 systemctl reload gwms-factory Enterprise Linux 6 service gwms-factory reconfig Note Notice that, in the case of Enterprise Linux 7 systemctl reload gwms-factory will work only if: - gwms-factory service is running - gwms-factory service was started with systemctl Otherwise, you will get the following error in any of the cases: # systemctl reload gwms-factory Job for gwms-factory.service invalid.","title":"Reconfiguring GlideinWMS"},{"location":"services/install-gwms-factory/#upgrading-glideinwms","text":"Before you start the Factory service for the first time or after an update of the RPM or after you change GlideinWMS scripts, you should always use the GlideinWMS \"upgrade\" command. To do so: Make sure the condor and gwms-factory services are stopped (in EL6 this will be done for you). Issue the upgrade command: If you are using Enterprise Linux 7: root@host # /usr/sbin/gwms-factory upgrade If you are using Enterprise Linux 6: root@host # service gwms-factory upgrade Start the condor and gwms-factory services (see next part).","title":"Upgrading GlideinWMS"},{"location":"services/install-gwms-factory/#service-activation-and-deactivation","text":"To start the Factory you must start also HTCondor and the Web server beside the Factory itself: # %RED%For RHEL 6 , CentOS 6 , and SL6%ENDCOLOR% root@host # service condor start root@host # service httpd start root@host # service gwms-factory start # %RED% For RHEL 7 , CentOS 7 , and SL7%ENDCOLOR% root@host # systemctl start condor root@host # systemctl start httpd root@host # systemctl start gwms-factory Note Once you successfully start using the Factory service, anytime you change the /etc/gwms-factory/glideinWMS.xml file you will need to run a reconfig/reload command. If you change also some code you need the upgrade command mentioned above: # %RED% For RHEL 6 , CentOS 6 , and SL6%ENDCOLOR% root@host # service gwms-factory reconfig # %RED% But the situation is a bit more complicated in RHEL 7 , CentOS 7 , and SL7 due to systemd restrictions%ENDCOLOR% # %GREEN% For reconfig:%ENDCOLOR% A. %RED% when the Factory is running%ENDCOLOR% A.1 %RED% without any additional options%ENDCOLOR% root@host # /usr/sbin/gwms-factory reconfig%ENDCOLOR% or root@host # systemctl reload gwms-factory A.2 %RED% if you want to give additional options %ENDCOLOR% systemctl stop gwms-factory /usr/sbin/gwms-factory reconfig \"and your options\" systemctl start gwms-factory B. 
%RED% when the Factory is NOT running %ENDCOLOR% root@host # /usr/sbin/gwms-factory reconfig ( \"and your options\" ) To enable the services so that they restart after a reboot: # %RED%# For RHEL 6 , CentOS 6 , and SL6%ENDCOLOR% root@host # /sbin/chkconfig fetch-crl-cron on root@host # /sbin/chkconfig fetch-crl-boot on root@host # /sbin/chkconfig condor on root@host # /sbin/chkconfig httpd on root@host # /sbin/chkconfig gwms-factory on # %RED%# For RHEL 7 , CentOS 7 , and SL7%ENDCOLOR% root@host # systemctl enable fetch-crl-cron root@host # systemctl enable fetch-crl-boot root@host # systemctl enable condor root@host # systemctl enable httpd root@host # systemctl enable gwms-factory To stop the Factory: # %RED%For RHEL 6 , CentOS 6 , and SL6 %ENDCOLOR% root@host # service gwms-factory stop # %RED%For RHEL 7 , CentOS 7 , and SL7%ENDCOLOR% root@host # systemctl stop gwms-factory And you can stop also the other services if you are not using them independently of the Factory.","title":"Service Activation and Deactivation"},{"location":"services/install-gwms-factory/#validating-glideinwms-factory","text":"The complete validation of the Factory is the submission of actual jobs. You can also check that the services are up and running: root@host # condor_status -any MyType TargetType Name glidefactoryclient None 12345_TEST_ENTRY@gfactory_instance@ glideclient None 12345_TEST_ENTRY@gfactory_instance@ glidefactory None TEST_ENTRY@gfactory_instance@ glidefactoryglobal None gfactory_instance@gfactory_ser glideclientglobal None gfactory_instance@gfactory_ser Scheduler None hostname.fnal.gov DaemonMaster None hostname.fnal.gov Negotiator None hostname.fnal.gov Scheduler None schedd_glideins2@hostname Scheduler None schedd_glideins3@hostname Scheduler None schedd_glideins4@hostname Scheduler None schedd_glideins5@hostname Collector None wmscollector_service@hostname You should have one \"glidefactory\" classAd for each entry that you have enabled. If you have already configured the frontends, you will also have one glidefactoryclient and one glideclient classAd for each frontend / entry. You can check also the monitoring Web page: http://YOUR_HOST_FQDN/factory/monitor/ You can also test the local submission of a job to a resource using the test script local_start.sh but you must first install the OSG client tools and generate a proxy. After that you can run the test (replace ENTRY_NAME with the name of one of the entries in /etc/gwms-factory/glideinWMS.xml ):","title":"Validating GlideinWMS Factory"},{"location":"services/install-gwms-factory/#check-web-server-configuration-for-the-monitoring","text":"Verify path and specially the URL for the GlideinWMS files served by your web server: stage base_dir = \"/var/lib/gwms-factory/web-area/stage\" use_symlink = \"True\" web_base_url = \"http://HOSTNAME:PORT/factory/stage\" This will determine the location of your web server . Make sure that the URL is visible. Depending on your firewall or the one of your organization, you may need to change the port here and in the httpd configuration (by modifying the \"Listen\" directive in /etc/httpd/conf/httpd.conf ). 
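As a quick sanity check, you can request the stage URL from another machine and confirm that the web server answers (HOSTNAME and PORT below are placeholders for the values in your web_base_url ):
user@host $ curl -I http://HOSTNAME:PORT/factory/stage/
A successful HTTP response indicates the staging area is reachable; a connection timeout usually points at a firewall or at the Listen port mismatch described above.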
Note that web servers are an often-attacked piece of infrastructure, so you may want to go through the Apache configuration in /etc/httpd/conf/httpd.conf and disable unneeded modules.","title":"Check Web server configuration for the monitoring"},{"location":"services/install-gwms-factory/#troubleshooting-glideinwms-factory","text":"","title":"Troubleshooting GlideinWMS Factory"},{"location":"services/install-gwms-factory/#file-locations","text":"File Description File Location Comment Configuration file /etc/gwms-factory/glideinWMS.xml Main configuration file Logs /var/log/gwms-factory/server/factory Overall server logs /var/log/gwms-factory/server/entry_NAME Specific entry logs (generally more useful) /var/log/gwms-factory/client Glidein Pilot logs separated by user and entry Startup script /etc/init.d/gwms-factory Web Directory /var/lib/gwms-factory/web-area Web Base /var/lib/gwms-factory/web-base Working Directory /var/lib/gwms-factory/work-dir/","title":"File Locations"},{"location":"services/install-gwms-factory/#increase-the-log-level-and-change-rotation-policies","text":"You can increase the log level of the Factory. To add a log file with all the log information, add the following line with all the message types in the process_log section of /etc/gwms-factory/glideinWMS.xml : You can also change the rotation policy and choose whether to compress the rotated files, all in the same section of the config files: max_bytes is the max size of the log files; max_days is the number of days after which the log file is rotated; compression specifies whether rotated files are compressed; backup_count is the number of rotated log files kept. Further details are in the reference documentation .","title":"Increase the log level and change rotation policies"},{"location":"services/install-gwms-factory/#failed-authentication-errors","text":"If you get messages such as these in the logs, the Factory does not trust the frontend and will not submit glideins. WARNING: Client fermicloud128-fnal-gov_OSG_gWMSFrontend.main (secid: frontend_name) not in white list. Skipping request This error means that the frontend name in the security section of the Factory does not match the security_name in the frontend. Client fermicloud128-fnal-gov_OSG_gWMSFrontend.main (secid: frontend_name) is not coming from a trusted source; AuthenticatedIdentity vofrontend_condor@fermicloud130.fnal.gov!=vofrontend_factory@fermicloud130.fnal.gov. Skipping for security reasons. This error means that the identity in the security section of the Factory does not match what the /etc/condor/certs/condor_mapfile authenticates the Frontend to in HTCondor (AuthenticatedIdentity in the classad). Make sure the attributes are correctly lined up as in the Frontend security configuration section above.","title":"Failed authentication errors"},{"location":"services/install-gwms-factory/#glideins-start-but-do-not-connect-to-user-pool-vo-frontend","text":"Check the appropriate job err and out logs in /var/log/gwms-factory/client to see if any errors were reported. Often, this will be a pilot unable to access a web server or with an invalid proxy.
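For example, a quick way to scan the most recent pilot logs for obvious problems (the frontend user, factory instance, and entry name in the path are placeholders; use the directories that actually exist under /var/log/gwms-factory/client ):
root@host # ls -lt /var/log/gwms-factory/client/user_frontend/glidein_gfactory_instance/ENTRY_NAME/ | head
root@host # grep -iE 'error|proxy' /var/log/gwms-factory/client/user_frontend/glidein_gfactory_instance/ENTRY_NAME/*.err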
Also, verify that the condor_mapfile is correct on the VO Frontend's user pool collector and configuration.","title":"Glideins start but do not connect to User pool / VO Frontend"},{"location":"services/install-gwms-factory/#glideins-start-but-fail-before-running-job-with-error-proxy-not-long-lived-enough","text":"If the glideins are running on a resource (entry) but the jobs are not running and the log files in /var/log/gwms-factory/client/user_frontend/glidein_gfactory_instance/ENTRY_NAME report an error like \"Proxy not long lived enough (86096 s left), shortened retire time ...\", then probably the HTCondor RLM on the Compute Element is delegating the proxy and shortening its lifespan. This can be fixed by setting DELEGATE_JOB_GSI_CREDENTIALS = FALSE as suggested in the CE install document .","title":"Glideins start but fail before running job with error \"Proxy not long lived enough\""},{"location":"services/install-gwms-factory/#references","text":"http://glideinwms.fnal.gov/doc.prd/ https://opensciencegrid.org/docs/other/install-gwms-frontend/","title":"References"},{"location":"services/sending-announcements/","text":"Sending Announcements Various OSG teams need to send out announcement about various events (releases, security advisories, planned changes, etc). This page describes how to send announcements using the osg-notify tool. Prerequisites To send announcements, the following conditions must be met: A host with an IP address listed in the SPF Record A sufficiently modern Linux operating system. This procedure has been tested on a FermiCloud Scientific Linux 7 VM and a Linux Mint 18.3 laptop. It is known not to work on a FermiCloud Scientific Linux 6 VM. A valid OSG user certificate to lookup contacts in the topology database Local hostname matches DNS DNS forward and reverse lookups in place [tim@submit-1 topology]$ hostname submit-1.chtc.wisc.edu [tim@submit-1 topology]$ host submit-1.chtc.wisc.edu submit-1.chtc.wisc.edu has address 128.105.244.191 [tim@submit-1 topology]$ host 128 .105.244.191 191.244.105.128.in-addr.arpa domain name pointer submit-1.chtc.wisc.edu. (Required for security announcements) A GPG Key to sign the announcement Installation Install the required Yum repositories : Install the OSG tools: # yum install --enablerepo = devops topology-client If you are on a FermiCloud VM, update postfix to relay through FermiLab's official mail server: echo \"transport_maps = hash:/etc/postfix/transport\" >> /etc/postfix/main.cf echo \"* smtp:smtp.fnal.gov\" >> /etc/postfix/transport postmap hash:/etc/postfix/transport postfix reload Test this setup by sending a message to yourself only. Bonus points for using an email address that goes to a site with aggressive SPAM filtering. 
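For instance, a minimal relay test from the shell (assuming the mailx package is installed; the recipient address is a placeholder for your own):
root@host # echo \"relay test from $(hostname -f)\" | mail -s \"postfix relay test\" you@example.edu
If the message does not arrive, check /var/log/maillog for the reason it was deferred or rejected.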
Sending the announcement Use the osg-notify tool to send the announcement using the relevant options from the following table: Option Description --dry-run Use this option until you are ready to actually send the message --cert File that contains your OSG User Certificate --key File that contains your Private Key for your OSG User Certificate --no-sign Don't GPG sign the message (release only) --type production Not a test message --message File containing your message --subject The subject of your message --recipients List of recipient email addresses, must have at least one --oim-recipients Select contacts associated with resources and/or VOs --oim-contact-type Replacing with administrative for release announcements or security for security announcements --bypass-dns-check Use this option to skip the check that one of the host's IP addresses matches with the hostname resolution Security requirements Security announcements must be signed using the following options: --sign : GPG sign the message --sign-id : The ID of the key used for singing --from security : The mail comes from the OSG Security Team For release announcements use the following command: osg-notify --cert your-cert.pem --key your-key.pem \\ --no-sign --type production --message \\ --subject '' \\ --recipients \"osg-general@opensciencegrid.org osg-operations@opensciencegrid.org osg-sites@opensciencegrid.org vdt-discuss@opensciencegrid.org\" \\ --oim-recipients resources --oim-recipients vos --oim-contact-type administrative Replacing with an appropriate subject for your announcement and with the path to the file containing your message in plain text.","title":"Sending Announcements"},{"location":"services/sending-announcements/#sending-announcements","text":"Various OSG teams need to send out announcement about various events (releases, security advisories, planned changes, etc). This page describes how to send announcements using the osg-notify tool.","title":"Sending Announcements"},{"location":"services/sending-announcements/#prerequisites","text":"To send announcements, the following conditions must be met: A host with an IP address listed in the SPF Record A sufficiently modern Linux operating system. This procedure has been tested on a FermiCloud Scientific Linux 7 VM and a Linux Mint 18.3 laptop. It is known not to work on a FermiCloud Scientific Linux 6 VM. A valid OSG user certificate to lookup contacts in the topology database Local hostname matches DNS DNS forward and reverse lookups in place [tim@submit-1 topology]$ hostname submit-1.chtc.wisc.edu [tim@submit-1 topology]$ host submit-1.chtc.wisc.edu submit-1.chtc.wisc.edu has address 128.105.244.191 [tim@submit-1 topology]$ host 128 .105.244.191 191.244.105.128.in-addr.arpa domain name pointer submit-1.chtc.wisc.edu. (Required for security announcements) A GPG Key to sign the announcement","title":"Prerequisites"},{"location":"services/sending-announcements/#installation","text":"Install the required Yum repositories : Install the OSG tools: # yum install --enablerepo = devops topology-client If you are on a FermiCloud VM, update postfix to relay through FermiLab's official mail server: echo \"transport_maps = hash:/etc/postfix/transport\" >> /etc/postfix/main.cf echo \"* smtp:smtp.fnal.gov\" >> /etc/postfix/transport postmap hash:/etc/postfix/transport postfix reload Test this setup by sending a message to yourself only. 
Bonus points for using an email address that goes to a site with aggressive SPAM filtering.","title":"Installation"},{"location":"services/sending-announcements/#sending-the-announcement","text":"Use the osg-notify tool to send the announcement using the relevant options from the following table: Option Description --dry-run Use this option until you are ready to actually send the message --cert File that contains your OSG User Certificate --key File that contains your Private Key for your OSG User Certificate --no-sign Don't GPG sign the message (release only) --type production Not a test message --message File containing your message --subject The subject of your message --recipients List of recipient email addresses, must have at least one --oim-recipients Select contacts associated with resources and/or VOs --oim-contact-type Replacing with administrative for release announcements or security for security announcements --bypass-dns-check Use this option to skip the check that one of the host's IP addresses matches with the hostname resolution Security requirements Security announcements must be signed using the following options: --sign : GPG sign the message --sign-id : The ID of the key used for singing --from security : The mail comes from the OSG Security Team For release announcements use the following command: osg-notify --cert your-cert.pem --key your-key.pem \\ --no-sign --type production --message \\ --subject '' \\ --recipients \"osg-general@opensciencegrid.org osg-operations@opensciencegrid.org osg-sites@opensciencegrid.org vdt-discuss@opensciencegrid.org\" \\ --oim-recipients resources --oim-recipients vos --oim-contact-type administrative Replacing with an appropriate subject for your announcement and with the path to the file containing your message in plain text.","title":"Sending the announcement"},{"location":"services/topology-contacts-data/","text":"Topology and Contacts Data This is internal documentation intended for OSG Operations staff. It contains information about the data provided by https://topology.opensciencegrid.org . The topology data for the service is in https://github.com/opensciencegrid/topology , in the projects/ , topology/ , and virtual-organizations/ subdirectories. The contacts data is in https://bitbucket.org/opensciencegrid/contact/ , in contacts.yaml . Topology Data Admins may request changes to data in the topology repo via either a GitHub pull request or a Freshdesk ticket. These changes can be to a project, a VO, or a resource. The registration document and topology README document should tell them how to do that. In the case of a GitHub pull request, you will need to provide IDs using the bin/next_ids tool in an up-to-date local clone of Topology and potentially fix-up other data. To assist the user, do one of the following, depending on the severity of the fixes required for the PR: For minor issues, submit a \"Comment\" review using GitHub suggestions and ask the user to incorporate your suggestions . For major issues, create a branch based off of their PR, make changes, and submit your own PR that closes the original user's PR. The CI checks should catch most errors but you should still review the YAML changes. Certain things to check are: Do contact names and IDs match what's in the contacts data? (See below for instructions on how to get that information.) If the person is not in the contacts data, you will need to add them before approving the PR. Is the PR submitter authorized to make changes to that project/VO/resource? 
Can you match them to a person affiliated with that project/VO/site? (The contacts data now includes the GitHub usernames for some people. See below for instructions on how to get that information.) Is their GitHub ID registered in the contact database and are they associated with the relevant resource, site, facility, or VO? Retiring resources A resource can be disabled in its topology yaml file by setting Active: false . However the resource entry should not be immediately deleted from the yaml file. One reason for this is that the WLCG accounting info configured for resources is used to determine which resources to send APEL numbers for. Removing resources prematurely could prevent resummarized GRACC data from getting sent appropriately. Resources that have been inactive for at least two years are eligible to be deleted from the topology database. The GRACC records for this resource can be inspected in Kibana . In the search bar, enter ProbeName:*\\:FQDN in the search bar, where FQDN is the FQDN defined for your resource For example, if your resource FQDN is cmsgrid01.hep.wisc.edu you would enter ProbeName:*\\:cmsgrid01.hep.wisc.edu In the upper-right corner, use the Time Range selection to pick \"Last 2 years\" With this criteria selected, Kibana will show you if it has received any records for this resource in the past two years. If there are no records returned, you may remove the resource from the resource group yaml file in the topology repo. Any downtime entries for this resource in the corresponding downtime yaml file for the resource group must be removed also. If you remove the last resource in the resource group yaml file, you should remove the resource group and corresponding downtime yaml files as well. Reviewing project PRs New projects are typically created by the Research Facilitation team. Here are a few things to check: Did osg-bot warn about a \"New Organization\"? If so, search around in the projects directory and make sure the \"Organization\" in the YAML is not a typo or alternate spelling for an existing organization. grep around in the /projects/ directory for substrings of the organization. For example, if the new org is \"University of Wisconsin Madison\", do: $ grep -i wisconsin projects/*.yaml and you will see that it's supposed to be \"University of Wisconsin-Madison\". If the new organization is not a typo or alternate spelling, dismiss osg-bot's review with the comment \"new org is legit\". - If osg-bot included a message about \"Unrecognized InstitutionID\" alongside this warning, check that the \"InstitutionID\" field in the project contains an ID that's found in the OSG institutions database . The topology project web-form should automatically populate the InstitutionID field for known institutions, but may fail in the case of spelling discrepancies. - If the organization is absent from the institutions database, add a comment to the pull request mentioning @opensciencegrid/project-office to request permission to add the institution. Once you have obtained permission, search for the institution's canonical name in the Research Organization Registry before adding it to the database. - Is the project name is of the form _ , e.g. UWMadison_Parks ? (This is recommended but not required for new projects.) If so: Is the short name -> organization mapping for the institution in /mappings/project_institution.yaml (e.g. UWMadison: \"University of Wisconsin-Madison\" )? If not, ask the PR author to add it. 
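For example, you can confirm the short-name mapping with a quick grep in your topology clone (the short name shown is illustrative):
$ grep UWMadison mappings/project_institution.yaml
UWMadison: \"University of Wisconsin-Madison\"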
Does the \"FieldOfScience\" in the YAML match one of the keys in /mappings/nsfscience.yaml ? (The list is also available on the left column of this CSV .) Is the \"Sponsor\" correct? The sponsor depends on where the users will be submitting jobs from: If they primarily submit from some CI Connect interface such as \"OSG Connect\", use: Sponsor : CampusGrid : Name : The campus grid name must be one of the ones in the /projects/_CAMPUS_GRIDS.yaml file.. Otherwise, the project must be sponsored by a VO: Sponsor : VirtualOrganization : Name : The VO name must be one of the ones in the /virtual-organizations/ dir. Contacts Data The OSG keeps contact data for administrators and maintainers of OSG resources and VOs for the purpose of distributing security, software, and adminstrative (e.g., OSG All-Hands dates) announcements. Additionally, OSG contacts have the following abilities: View other contacts' information (via HTML and XML ) with a registered certificate Register resource downtimes for resources that they are listed as an administrative contact, if they have a registered GitHub ID Contact data is kept as editable YAML in https://bitbucket.org/opensciencegrid/contact/ , in contacts.yaml . The YAML file contains sensitive information and is only visible to people with access to that repo. Getting access to the contact repo The contacts repo is hosted on BitBucket. You will need an Atlassian account for access to BitBucket. The account you use for OSG JIRA should work. Once you have an account, request access from Brian Lin, Mat Selmeci, or Derek Weitzel. You should then be able to go to https://bitbucket.org/opensciencegrid/contact/ . Using the contact repo BitBucket is similar to GitHub except you don't make a fork of the contact repo, you just clone it to your local machine. This means that any pushes go directly to the main repo instead of your own fork. Danger Don't push to master. For any changes, always create your own branch, push your changes to that branch, then make a pull request. Have someone else review and merge your pull request. All contact data is stored in contacts.yaml . The contact info is keyed by a 40-character hexadecimal ID which was generated from their email address when they were first added. An example entry is: 25357f62c7ab2ae11ddda1efd272bb5435dbfacb : # ^ this is their ID FullName : Example A. User Profile : This is an example user. GitHub : ExampleUser # ContactInformation data requires authorization to view ContactInformation : DNs : - ... IM : ... PrimaryEmail : user@example.net PrimaryPhone : ... When making changes to the contact data, first see if a contact is already in the YAML file. Search the YAML file for their name. Be sure to try variations of their name if you don't find them -- someone may be listed as \"Dave\" or \"David\", or have a middle name or middle initial. Follow the instructions below for adding or updating a contact, as appropriate. Adding a new contact Danger Any new contacts need to have their association with the OSG verified by a known contact within the relevant VO, site, or project. When registering a new contact, first obtain the required contact information . After obtaining this information and verifying their association with the OSG, fill out the values in template-contacts.yaml and add it to contacts.yaml . To get the hash used as the ID, run email-hash on their email address. 
For example: $ cd contact # this is your local clone of the \"contact\" repo $ bin/email-hash user@example.net 25357f62c7ab2ae11ddda1efd272bb5435dbfacb Then your new entry will look like 25357f62c7ab2ae11ddda1efd272bb5435dbfacb : FullName : Example A. User .... The FullName and Profile fields in the main section, and the PrimaryEmail field in the ContactInformation section are required. The PrimaryEmail field in the ContactInformation section should match the hash that you used for the ID. In addition, if they will be making pull requests against the topology repo, e.g. for updating site information, reporting downtime, or updating project or VO information, obtain their GitHub username and put it in the GitHub field. Editing a contact Once you have found a contact in the YAML file, edit the attributes by hand. If you want to add information that is not present for that contact, look at template-contacts.yaml to find out what the attributes are called. Note The ID of the contact never changes, even if the user's PrimaryEmail changes. Important If you change the contact's FullName , you must make the same change to every place that the contact is mentioned in the topology repo. Get the contact changes merged in first.","title":"Topology and Contacts Data"},{"location":"services/topology-contacts-data/#topology-and-contacts-data","text":"This is internal documentation intended for OSG Operations staff. It contains information about the data provided by https://topology.opensciencegrid.org . The topology data for the service is in https://github.com/opensciencegrid/topology , in the projects/ , topology/ , and virtual-organizations/ subdirectories. The contacts data is in https://bitbucket.org/opensciencegrid/contact/ , in contacts.yaml .","title":"Topology and Contacts Data"},{"location":"services/topology-contacts-data/#topology-data","text":"Admins may request changes to data in the topology repo via either a GitHub pull request or a Freshdesk ticket. These changes can be to a project, a VO, or a resource. The registration document and topology README document should tell them how to do that. In the case of a GitHub pull request, you will need to provide IDs using the bin/next_ids tool in an up-to-date local clone of Topology and potentially fix-up other data. To assist the user, do one of the following, depending on the severity of the fixes required for the PR: For minor issues, submit a \"Comment\" review using GitHub suggestions and ask the user to incorporate your suggestions . For major issues, create a branch based off of their PR, make changes, and submit your own PR that closes the original user's PR. The CI checks should catch most errors but you should still review the YAML changes. Certain things to check are: Do contact names and IDs match what's in the contacts data? (See below for instructions on how to get that information.) If the person is not in the contacts data, you will need to add them before approving the PR. Is the PR submitter authorized to make changes to that project/VO/resource? Can you match them to a person affiliated with that project/VO/site? (The contacts data now includes the GitHub usernames for some people. See below for instructions on how to get that information.) 
Is their GitHub ID registered in the contact database and are they associated with the relevant resource, site, facility, or VO?","title":"Topology Data"},{"location":"services/topology-contacts-data/#retiring-resources","text":"A resource can be disabled in its topology yaml file by setting Active: false . However the resource entry should not be immediately deleted from the yaml file. One reason for this is that the WLCG accounting info configured for resources is used to determine which resources to send APEL numbers for. Removing resources prematurely could prevent resummarized GRACC data from getting sent appropriately. Resources that have been inactive for at least two years are eligible to be deleted from the topology database. The GRACC records for this resource can be inspected in Kibana . In the search bar, enter ProbeName:*\\:FQDN in the search bar, where FQDN is the FQDN defined for your resource For example, if your resource FQDN is cmsgrid01.hep.wisc.edu you would enter ProbeName:*\\:cmsgrid01.hep.wisc.edu In the upper-right corner, use the Time Range selection to pick \"Last 2 years\" With this criteria selected, Kibana will show you if it has received any records for this resource in the past two years. If there are no records returned, you may remove the resource from the resource group yaml file in the topology repo. Any downtime entries for this resource in the corresponding downtime yaml file for the resource group must be removed also. If you remove the last resource in the resource group yaml file, you should remove the resource group and corresponding downtime yaml files as well.","title":"Retiring resources"},{"location":"services/topology-contacts-data/#reviewing-project-prs","text":"New projects are typically created by the Research Facilitation team. Here are a few things to check: Did osg-bot warn about a \"New Organization\"? If so, search around in the projects directory and make sure the \"Organization\" in the YAML is not a typo or alternate spelling for an existing organization. grep around in the /projects/ directory for substrings of the organization. For example, if the new org is \"University of Wisconsin Madison\", do: $ grep -i wisconsin projects/*.yaml and you will see that it's supposed to be \"University of Wisconsin-Madison\". If the new organization is not a typo or alternate spelling, dismiss osg-bot's review with the comment \"new org is legit\". - If osg-bot included a message about \"Unrecognized InstitutionID\" alongside this warning, check that the \"InstitutionID\" field in the project contains an ID that's found in the OSG institutions database . The topology project web-form should automatically populate the InstitutionID field for known institutions, but may fail in the case of spelling discrepancies. - If the organization is absent from the institutions database, add a comment to the pull request mentioning @opensciencegrid/project-office to request permission to add the institution. Once you have obtained permission, search for the institution's canonical name in the Research Organization Registry before adding it to the database. - Is the project name is of the form _ , e.g. UWMadison_Parks ? (This is recommended but not required for new projects.) If so: Is the short name -> organization mapping for the institution in /mappings/project_institution.yaml (e.g. UWMadison: \"University of Wisconsin-Madison\" )? If not, ask the PR author to add it. 
Does the \"FieldOfScience\" in the YAML match one of the keys in /mappings/nsfscience.yaml ? (The list is also available on the left column of this CSV .) Is the \"Sponsor\" correct? The sponsor depends on where the users will be submitting jobs from: If they primarily submit from some CI Connect interface such as \"OSG Connect\", use: Sponsor : CampusGrid : Name : The campus grid name must be one of the ones in the /projects/_CAMPUS_GRIDS.yaml file.. Otherwise, the project must be sponsored by a VO: Sponsor : VirtualOrganization : Name : The VO name must be one of the ones in the /virtual-organizations/ dir.","title":"Reviewing project PRs"},{"location":"services/topology-contacts-data/#contacts-data","text":"The OSG keeps contact data for administrators and maintainers of OSG resources and VOs for the purpose of distributing security, software, and adminstrative (e.g., OSG All-Hands dates) announcements. Additionally, OSG contacts have the following abilities: View other contacts' information (via HTML and XML ) with a registered certificate Register resource downtimes for resources that they are listed as an administrative contact, if they have a registered GitHub ID Contact data is kept as editable YAML in https://bitbucket.org/opensciencegrid/contact/ , in contacts.yaml . The YAML file contains sensitive information and is only visible to people with access to that repo.","title":"Contacts Data"},{"location":"services/topology-contacts-data/#getting-access-to-the-contact-repo","text":"The contacts repo is hosted on BitBucket. You will need an Atlassian account for access to BitBucket. The account you use for OSG JIRA should work. Once you have an account, request access from Brian Lin, Mat Selmeci, or Derek Weitzel. You should then be able to go to https://bitbucket.org/opensciencegrid/contact/ .","title":"Getting access to the contact repo"},{"location":"services/topology-contacts-data/#using-the-contact-repo","text":"BitBucket is similar to GitHub except you don't make a fork of the contact repo, you just clone it to your local machine. This means that any pushes go directly to the main repo instead of your own fork. Danger Don't push to master. For any changes, always create your own branch, push your changes to that branch, then make a pull request. Have someone else review and merge your pull request. All contact data is stored in contacts.yaml . The contact info is keyed by a 40-character hexadecimal ID which was generated from their email address when they were first added. An example entry is: 25357f62c7ab2ae11ddda1efd272bb5435dbfacb : # ^ this is their ID FullName : Example A. User Profile : This is an example user. GitHub : ExampleUser # ContactInformation data requires authorization to view ContactInformation : DNs : - ... IM : ... PrimaryEmail : user@example.net PrimaryPhone : ... When making changes to the contact data, first see if a contact is already in the YAML file. Search the YAML file for their name. Be sure to try variations of their name if you don't find them -- someone may be listed as \"Dave\" or \"David\", or have a middle name or middle initial. Follow the instructions below for adding or updating a contact, as appropriate.","title":"Using the contact repo"},{"location":"services/topology-contacts-data/#adding-a-new-contact","text":"Danger Any new contacts need to have their association with the OSG verified by a known contact within the relevant VO, site, or project. When registering a new contact, first obtain the required contact information . 
After obtaining this information and verifying their association with the OSG, fill out the values in template-contacts.yaml and add it to contacts.yaml . To get the hash used as the ID, run email-hash on their email address. For example: $ cd contact # this is your local clone of the \"contact\" repo $ bin/email-hash user@example.net 25357f62c7ab2ae11ddda1efd272bb5435dbfacb Then your new entry will look like 25357f62c7ab2ae11ddda1efd272bb5435dbfacb : FullName : Example A. User .... The FullName and Profile fields in the main section, and the PrimaryEmail field in the ContactInformation section are required. The PrimaryEmail field in the ContactInformation section should match the hash that you used for the ID. In addition, if they will be making pull requests against the topology repo, e.g. for updating site information, reporting downtime, or updating project or VO information, obtain their GitHub username and put it in the GitHub field.","title":"Adding a new contact"},{"location":"services/topology-contacts-data/#editing-a-contact","text":"Once you have found a contact in the YAML file, edit the attributes by hand. If you want to add information that is not present for that contact, look at template-contacts.yaml to find out what the attributes are called. Note The ID of the contact never changes, even if the user's PrimaryEmail changes. Important If you change the contact's FullName , you must make the same change to every place that the contact is mentioned in the topology repo. Get the contact changes merged in first.","title":"Editing a contact"},{"location":"services/topology/","text":"Topology Service This document contains information about the service that runs: https://topology.opensciencegrid.org https://topology-itb.opensciencegrid.org https://map.opensciencegrid.org : Generates the topology map used on OSG Display The source code for the service is in https://github.com/opensciencegrid/topology , in the src/ subdirectory. This repository also contains the public part of the data that gets served. Deployment Topology is a webapp run with Apache on the host topology.opensciencegrid.org . The ITB instance runs on the host topology-itb.opensciencegrid.org . The hosts are VMs at Nebraska; for SSH access, contact Derek Weitzel or Brian Bockelman. Installation These instructions assume an EL 7 host with the EPEL repositories available. The software will be installed into /opt/topology . A second instance for the webhook app will be installed into /opt/topology-webhook . (The ITB instance should be installed into /opt/topology-itb and /opt/topology-itb-webhook instead.) The following steps should be done as root. Install prerequisites: # yum install python36 gridsite httpd mod_ssl Clone the repository: For the production topology host: # git clone https://github.com/opensciencegrid/topology /opt/topology # git clone https://github.com/opensciencegrid/topology /opt/topology-webhook For the topology-itb host: # git clone https://github.com/opensciencegrid/topology /opt/topology-itb # git clone https://github.com/opensciencegrid/topology /opt/topology-itb-webhook Set up the virtualenv in the clone -- from /opt/topology or /opt/topology-itb : # python36 -m venv venv # . ./venv/bin/activate # pip install -r requirements-apache.txt Repeat for the webhook instance -- from /opt/topology-webhook or /opt/topology-itb-webhook . 
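As a quick check that the virtualenv built correctly, you can confirm that the mod_wsgi module it provides is present (production path shown; adjust for the ITB instance; this is the same location referenced under Software configuration below):
root@host # ls /opt/topology/venv/lib/python3.6/site-packages/mod_wsgi/server/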
File system locations The following files/directories must exist and have the proper permissions: Location Purpose Ownership Mode /opt/topology Production software install root:root 0755 /opt/topology-itb ITB software install root:root 0755 /opt/topology-webhook Production webhook software install root:root 0755 /opt/topology-itb-webhook ITB webhook software install root:root 0755 /etc/opt/topology/config-production.py Production config root:root 0644 /etc/opt/topology/config-itb.py ITB config root:root 0644 /etc/opt/topology/bitbucket Private key for contact info repo apache:root 0600 /etc/opt/topology/bitbucket.pub Public key for contact info repo apache:root 0644 /etc/opt/topology/github Private key for pushing automerge commits topomerge:root 0600 /etc/opt/topology/github.pub Public key for pushing automerge commits topomerge:root 0644 /etc/opt/topology/github_webhook_secret GitHub webhook secret for validating webhooks topomerge:root 0600 ~apache/.ssh SSH dir for Apache apache:root 0700 ~apache/.ssh/known_hosts Known hosts file for Apache apache:root 0644 ~topomerge Home dir for topomerge Apache user topomerge:root 0755 ~topomerge/.ssh SSH dir for topomerge Apache user topomerge:root 0700 ~topomerge/.ssh/known_hosts Known hosts file for topomerge Apache user topomerge:root 0644 /var/cache/topology Checkouts of topology and contacts data for production instance apache:apache 0755 /var/cache/topology-itb Checkouts of topology and contacts data for ITB instance apache:apache 0755 /var/cache/topology-webhook Topology repo and state info for production webhook instance topomerge:topomerge 0755 /var/cache/topology-itb-webhook Topology repo and state info for ITB webhook instance topomerge:topomerge 0755 ~apache/.ssh/known_hosts must contain an entry for bitbucket.org ; use ssh-keyscan bitbucket.org to get the appropriate entry. ~topomerge/.ssh/known_hosts must contain an entry for github.com ; use ssh-keyscan github.com to get the appropriate entry. Software configuration Configuration for the main app is under /etc/opt/topology/ , in config-production.py and config-itb.py . The webhook app configuration is in config-production-webhook.py and config-itb-webhook.py . The files are in Python format and override default settings in src/webapp/default_config.py in the topology repo. HTTPD configuration is in /etc/httpd ; we use the modules mod_ssl , mod_gridsite , and mod_wsgi . The first two are installed via yum; the .so file for mod_wsgi is located in /opt/topology/venv/lib/python3.6/site-packages/mod_wsgi/server/ or /opt/topology-itb/venv/lib/python3.6/site-packages/mod_wsgi/server/ for the ITB instance. Each of the hostnames are VHosts in the apache configuration. Some special notes: https://map.opensciencegrid.org runs in the same wsgi process as the production topology, but the URL is limited to only the map code. Further, it does not use mod_gridsite so that users are not asked to present a client certificate. VHosts are configured: ServerName topology.opensciencegrid.org ServerAlias my.opensciencegrid.org myosg.opensciencegrid.org Data configuration Configuration is in /etc/opt/topology/config-production.py and config-itb.py ; and config-production-webhook.py and config-itb-webhook.py . 
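For example (run as root, and review the keys before trusting them; afterwards make sure the ownership and mode still match the table above):
root@host # ssh-keyscan bitbucket.org >> ~apache/.ssh/known_hosts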
Variable Purpose TOPOLOGY_DATA_DIR The directory containing a clone of the topology repository for data use TOPOLOGY_DATA_REPO The remote tracking repository of TOPOLOGY_DATA_DIR TOPOLOGY_DATA_BRANCH The remote tracking branch of TOPOLOGY_DATA_DIR WEBHOOK_DATA_DIR The directory containing a mirror-clone of the topology repository for webhook use WEBHOOK_DATA_REPO The remote tracking repository of WEBHOOK_DATA_DIR WEBHOOK_DATA_BRANCH The remote tracking branch of WEBHOOK_DATA_DIR WEBHOOK_STATE_DIR Directory containing webhook state information between pull request and status hooks WEBHOOK_SECRET_KEY Secret key configured on GitHub for webhook delivery CONTACT_DATA_DIR The directory containing a clone of the contact repository for data use CONTACT_DATA_REPO The remote tracking repository of CONTACT_DATA_DIR (default: \"git@bitbucket.org:opensciencegrid/contact.git\" ) CONTACT_DATA_BRANCH The remote tracking branch of CONTACT_DATA_BRANCH (default: \"master\" ) CACHE_LIFETIME Frequency of automatic data updates in seconds (default: 900 ) GIT_SSH_KEY Location of ssh public key file for git access. /etc/opt/topology/bitbucket.pub for the main app, and /etc/opt/topology/github.pub for the webhook app Puppet ensures that the production contact and topology clones are up to date with their configured remote tracking repo and branch. Puppet does not manage the ITB data directories so they need to be updated by hand during testing. GitHub Configuration for Webhook App Go to the https://github.com/opensciencegrid/topology/settings/hooks page on GitHub. There are four webhooks to set up; pull_request and status for both the topology and topology-itb hosts. Payload URL Content type Events to trigger webhook https://topology.opensciencegrid.org/webhook/status application/json Statuses https://topology.opensciencegrid.org/webhook/pull_request application/json Pull requests https://topology-itb.opensciencegrid.org/webhook/status application/json Statuses https://topology-itb.opensciencegrid.org/webhook/pull_request application/json Pull requests For each webhook, \"Secret\" should be a random 40 digit hex string, which should match the contents of the file /etc/opt/topology/github_webhook_secret (the path configured in WEBHOOK_SECRET_KEY ). The OSG's dedicated GitHub user for automating pushes is currently osg-bot . This user needs to have write access to the topology repo on GitHub. The ssh public key in /etc/opt/topology/github.pub should be registered with the osg-bot GitHub user. This can be done by logging into GitHub as osg-bot , and adding the new ssh key under the settings page. Required System Packages Currently the webhook app uses the mailx command to send email. If not already installed, install it with: :::console # yum install mailx Testing changes on the ITB instance All changes should be tested on the ITB instance before deploying to production. If you can, test them on your local machine first. These instructions assume that the code has not been merged to master. Update the ITB software installation at /opt/topology-itb and note the current branch: # cd /opt/topology-itb # git fetch --all # git status Check out the branch you are testing. 
If the target remote is not configured, add it : # git checkout -b / Verify that you are using the intended data associated with the code you are testing: If the data format has changed in an incompatible way, modify /etc/opt/topology/config-itb.py : Backup the ITB configuration file: # cd /etc/opt/topology # cp -p config-itb.py { ,.bak } Change the TOPOLOGY_DATA_DIR and/or CONTACT_DATA_DIR lines to point to a new directories so the previous data does not get overwritten with incompatible data. If you need to use a different branch for the data, switch to it: Check the branch of TOPOLOGY_DATA_DIR from /etc/opt/topology/config-itb.py # cd # git fetch --all # git status Note the previous branch, you will need this later If the target remote is not configured, add it Check out the target branch: # git checkout -b / Pull any upstream changes to ensure that your branch is up to date: # git pull For updates to the webhook app, follow the above instructions for the ITB webhook instance under /opt/topology-itb-webhook and its corresponding config file, /etc/opt/topology/config-itb-webhook.py . Restart httpd : # systemctl restart httpd Test the web interface at https://topology-itb.opensciencegrid.org . Errors and output are in /var/log/httpd/error_log . Reverting changes Switch /opt/topology-itb to the previous branch: # cd /opt/topology-itb # git checkout For updates to the webhook app, switch /opt/topology-itb-webhook to the previous master: # cd /opt/topology-itb-webhook # git checkout If you made config changes to /etc/opt/topology/config-itb.py or config-itb-webhook.py , restore the backup. If you checked out a different branch for data, revert it back to the old branch. Restart httpd : # systemctl restart httpd Test the web interface at https://topology-itb.opensciencegrid.org . Updating the production instance Updating the production instance is similar to updating ITB instance. Update master on the Git clone at /opt/topology : # cd /opt/topology # git pull origin master For updates to the webhook app, update master on the Git clone at /opt/topology-webhook : # cd /opt/topology-webhook # git pull origin master Make config changes to /etc/opt/topology/config-production.py and/or config-production-webhook.py if necessary. Restart httpd : # systemctl restart httpd Test the web interface at https://topology.opensciencegrid.org . Errors and output are in /var/log/httpd/error_log . Reverting changes Switch /opt/topology to the previous master: # cd /opt/topology # ## (use `git reflog` to find the previous commit that was used) # git reset --hard For updates to the webhook app, switch /opt/topology-webhook to the previous master: # cd /opt/topology-webhook ### (use `git reflog` to find the previous commit that was used) # git reset --hard If you made config changes to /etc/opt/topology/config-production.py or config-production-webhook.py , revert them. Restart httpd : # systemctl restart httpd Test the web interface at https://topology.opensciencegrid.org .","title":"Topology Service"},{"location":"services/topology/#topology-service","text":"This document contains information about the service that runs: https://topology.opensciencegrid.org https://topology-itb.opensciencegrid.org https://map.opensciencegrid.org : Generates the topology map used on OSG Display The source code for the service is in https://github.com/opensciencegrid/topology , in the src/ subdirectory. 
This repository also contains the public part of the data that gets served.","title":"Topology Service"},{"location":"services/topology/#deployment","text":"Topology is a webapp run with Apache on the host topology.opensciencegrid.org . The ITB instance runs on the host topology-itb.opensciencegrid.org . The hosts are VMs at Nebraska; for SSH access, contact Derek Weitzel or Brian Bockelman.","title":"Deployment"},{"location":"services/topology/#installation","text":"These instructions assume an EL 7 host with the EPEL repositories available. The software will be installed into /opt/topology . A second instance for the webhook app will be installed into /opt/topology-webhook . (The ITB instance should be installed into /opt/topology-itb and /opt/topology-itb-webhook instead.) The following steps should be done as root. Install prerequisites: # yum install python36 gridsite httpd mod_ssl Clone the repository: For the production topology host: # git clone https://github.com/opensciencegrid/topology /opt/topology # git clone https://github.com/opensciencegrid/topology /opt/topology-webhook For the topology-itb host: # git clone https://github.com/opensciencegrid/topology /opt/topology-itb # git clone https://github.com/opensciencegrid/topology /opt/topology-itb-webhook Set up the virtualenv in the clone -- from /opt/topology or /opt/topology-itb : # python36 -m venv venv # . ./venv/bin/activate # pip install -r requirements-apache.txt Repeat for the webhook instance -- from /opt/topology-webhook or /opt/topology-itb-webhook .","title":"Installation"},{"location":"services/topology/#file-system-locations","text":"The following files/directories must exist and have the proper permissions: Location Purpose Ownership Mode /opt/topology Production software install root:root 0755 /opt/topology-itb ITB software install root:root 0755 /opt/topology-webhook Production webhook software install root:root 0755 /opt/topology-itb-webhook ITB webhook software install root:root 0755 /etc/opt/topology/config-production.py Production config root:root 0644 /etc/opt/topology/config-itb.py ITB config root:root 0644 /etc/opt/topology/bitbucket Private key for contact info repo apache:root 0600 /etc/opt/topology/bitbucket.pub Public key for contact info repo apache:root 0644 /etc/opt/topology/github Private key for pushing automerge commits topomerge:root 0600 /etc/opt/topology/github.pub Public key for pushing automerge commits topomerge:root 0644 /etc/opt/topology/github_webhook_secret GitHub webhook secret for validating webhooks topomerge:root 0600 ~apache/.ssh SSH dir for Apache apache:root 0700 ~apache/.ssh/known_hosts Known hosts file for Apache apache:root 0644 ~topomerge Home dir for topomerge Apache user topomerge:root 0755 ~topomerge/.ssh SSH dir for topomerge Apache user topomerge:root 0700 ~topomerge/.ssh/known_hosts Known hosts file for topomerge Apache user topomerge:root 0644 /var/cache/topology Checkouts of topology and contacts data for production instance apache:apache 0755 /var/cache/topology-itb Checkouts of topology and contacts data for ITB instance apache:apache 0755 /var/cache/topology-webhook Topology repo and state info for production webhook instance topomerge:topomerge 0755 /var/cache/topology-itb-webhook Topology repo and state info for ITB webhook instance topomerge:topomerge 0755 ~apache/.ssh/known_hosts must contain an entry for bitbucket.org ; use ssh-keyscan bitbucket.org to get the appropriate entry. 
~topomerge/.ssh/known_hosts must contain an entry for github.com ; use ssh-keyscan github.com to get the appropriate entry.","title":"File system locations"},{"location":"services/topology/#software-configuration","text":"Configuration for the main app is under /etc/opt/topology/ , in config-production.py and config-itb.py . The webhook app configuration is in config-production-webhook.py and config-itb-webhook.py . The files are in Python format and override default settings in src/webapp/default_config.py in the topology repo. HTTPD configuration is in /etc/httpd ; we use the modules mod_ssl , mod_gridsite , and mod_wsgi . The first two are installed via yum; the .so file for mod_wsgi is located in /opt/topology/venv/lib/python3.6/site-packages/mod_wsgi/server/ or /opt/topology-itb/venv/lib/python3.6/site-packages/mod_wsgi/server/ for the ITB instance. Each of the hostnames are VHosts in the apache configuration. Some special notes: https://map.opensciencegrid.org runs in the same wsgi process as the production topology, but the URL is limited to only the map code. Further, it does not use mod_gridsite so that users are not asked to present a client certificate. VHosts are configured: ServerName topology.opensciencegrid.org ServerAlias my.opensciencegrid.org myosg.opensciencegrid.org","title":"Software configuration"},{"location":"services/topology/#data-configuration","text":"Configuration is in /etc/opt/topology/config-production.py and config-itb.py ; and config-production-webhook.py and config-itb-webhook.py . Variable Purpose TOPOLOGY_DATA_DIR The directory containing a clone of the topology repository for data use TOPOLOGY_DATA_REPO The remote tracking repository of TOPOLOGY_DATA_DIR TOPOLOGY_DATA_BRANCH The remote tracking branch of TOPOLOGY_DATA_DIR WEBHOOK_DATA_DIR The directory containing a mirror-clone of the topology repository for webhook use WEBHOOK_DATA_REPO The remote tracking repository of WEBHOOK_DATA_DIR WEBHOOK_DATA_BRANCH The remote tracking branch of WEBHOOK_DATA_DIR WEBHOOK_STATE_DIR Directory containing webhook state information between pull request and status hooks WEBHOOK_SECRET_KEY Secret key configured on GitHub for webhook delivery CONTACT_DATA_DIR The directory containing a clone of the contact repository for data use CONTACT_DATA_REPO The remote tracking repository of CONTACT_DATA_DIR (default: \"git@bitbucket.org:opensciencegrid/contact.git\" ) CONTACT_DATA_BRANCH The remote tracking branch of CONTACT_DATA_BRANCH (default: \"master\" ) CACHE_LIFETIME Frequency of automatic data updates in seconds (default: 900 ) GIT_SSH_KEY Location of ssh public key file for git access. /etc/opt/topology/bitbucket.pub for the main app, and /etc/opt/topology/github.pub for the webhook app Puppet ensures that the production contact and topology clones are up to date with their configured remote tracking repo and branch. Puppet does not manage the ITB data directories so they need to be updated by hand during testing.","title":"Data configuration"},{"location":"services/topology/#github-configuration-for-webhook-app","text":"Go to the https://github.com/opensciencegrid/topology/settings/hooks page on GitHub. There are four webhooks to set up; pull_request and status for both the topology and topology-itb hosts. 
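To see what a given instance is currently configured to use, you can grep the relevant file directly; the variables themselves are described in the table that follows:
root@host # grep -E 'DATA_(DIR|REPO|BRANCH)' /etc/opt/topology/config-itb.py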
Payload URL Content type Events to trigger webhook https://topology.opensciencegrid.org/webhook/status application/json Statuses https://topology.opensciencegrid.org/webhook/pull_request application/json Pull requests https://topology-itb.opensciencegrid.org/webhook/status application/json Statuses https://topology-itb.opensciencegrid.org/webhook/pull_request application/json Pull requests For each webhook, \"Secret\" should be a random 40 digit hex string, which should match the contents of the file /etc/opt/topology/github_webhook_secret (the path configured in WEBHOOK_SECRET_KEY ). The OSG's dedicated GitHub user for automating pushes is currently osg-bot . This user needs to have write access to the topology repo on GitHub. The ssh public key in /etc/opt/topology/github.pub should be registered with the osg-bot GitHub user. This can be done by logging into GitHub as osg-bot , and adding the new ssh key under the settings page.","title":"GitHub Configuration for Webhook App"},{"location":"services/topology/#required-system-packages","text":"Currently the webhook app uses the mailx command to send email. If not already installed, install it with: :::console # yum install mailx","title":"Required System Packages"},{"location":"services/topology/#testing-changes-on-the-itb-instance","text":"All changes should be tested on the ITB instance before deploying to production. If you can, test them on your local machine first. These instructions assume that the code has not been merged to master. Update the ITB software installation at /opt/topology-itb and note the current branch: # cd /opt/topology-itb # git fetch --all # git status Check out the branch you are testing. If the target remote is not configured, add it : # git checkout -b / Verify that you are using the intended data associated with the code you are testing: If the data format has changed in an incompatible way, modify /etc/opt/topology/config-itb.py : Backup the ITB configuration file: # cd /etc/opt/topology # cp -p config-itb.py { ,.bak } Change the TOPOLOGY_DATA_DIR and/or CONTACT_DATA_DIR lines to point to a new directories so the previous data does not get overwritten with incompatible data. If you need to use a different branch for the data, switch to it: Check the branch of TOPOLOGY_DATA_DIR from /etc/opt/topology/config-itb.py # cd # git fetch --all # git status Note the previous branch, you will need this later If the target remote is not configured, add it Check out the target branch: # git checkout -b / Pull any upstream changes to ensure that your branch is up to date: # git pull For updates to the webhook app, follow the above instructions for the ITB webhook instance under /opt/topology-itb-webhook and its corresponding config file, /etc/opt/topology/config-itb-webhook.py . Restart httpd : # systemctl restart httpd Test the web interface at https://topology-itb.opensciencegrid.org . Errors and output are in /var/log/httpd/error_log .","title":"Testing changes on the ITB instance"},{"location":"services/topology/#reverting-changes","text":"Switch /opt/topology-itb to the previous branch: # cd /opt/topology-itb # git checkout For updates to the webhook app, switch /opt/topology-itb-webhook to the previous master: # cd /opt/topology-itb-webhook # git checkout If you made config changes to /etc/opt/topology/config-itb.py or config-itb-webhook.py , restore the backup. If you checked out a different branch for data, revert it back to the old branch. 
Restart httpd : # systemctl restart httpd Test the web interface at https://topology-itb.opensciencegrid.org .","title":"Reverting changes"},{"location":"services/topology/#updating-the-production-instance","text":"Updating the production instance is similar to updating the ITB instance. Update master on the Git clone at /opt/topology : # cd /opt/topology # git pull origin master For updates to the webhook app, update master on the Git clone at /opt/topology-webhook : # cd /opt/topology-webhook # git pull origin master Make config changes to /etc/opt/topology/config-production.py and/or config-production-webhook.py if necessary. Restart httpd : # systemctl restart httpd Test the web interface at https://topology.opensciencegrid.org . Errors and output are in /var/log/httpd/error_log .","title":"Updating the production instance"},{"location":"services/topology/#reverting-changes_1","text":"Switch /opt/topology to the previous master: # cd /opt/topology ### (use `git reflog` to find the previous commit that was used) # git reset --hard <commit> For updates to the webhook app, switch /opt/topology-webhook to the previous master: # cd /opt/topology-webhook ### (use `git reflog` to find the previous commit that was used) # git reset --hard <commit> If you made config changes to /etc/opt/topology/config-production.py or config-production-webhook.py , revert them. Restart httpd : # systemctl restart httpd Test the web interface at https://topology.opensciencegrid.org .","title":"Reverting changes"},{"location":"troubleshooting/repository-scripts/","text":"Troubleshooting Guide for Yum Repository Scripts The repo.opensciencegrid.org and repo-itb.opensciencegrid.org hosts contain the OSG Yum software repositories plus related services and tools. In particular, the mash software is used to download RPMs from where they are built (at the University of Wisconsin\u2013Madison), and there are some associated scripts to configure and invoke mash periodically. Use this guide to monitor the mash system for problems and to perform basic troubleshooting when such problems arise. Monitoring To monitor the repository hosts for proper mash operation, do the following steps on each host: ssh to repo.opensciencegrid.org and cd into /var/log/repo to view logs from mash updates Examine the \u201cLast modified\u201d timestamp of all of the update_repo.*.log files If the timestamps are all less than 2 hours old, life is good and you can skip the remaining steps below Otherwise, examine the \u201cLast modified\u201d timestamp of the update_all_repos.err file If the update_all_repos.err timestamp is current, there may be a mash process that is hung; see the Troubleshooting steps below If all timestamps are more than 6 hours old, something may be wrong with cron or its mash entries: Verify that cron is running and that the cron entries for mash are still present; if not, try to restore things Otherwise, create a Freshdesk ticket with a subject like \u201cRepo update logs are too old on <hostname>\u201d and with relevant details in the body Assign the ticket to the \u201cSoftware\u201d group Troubleshooting and Mitigation Identifying and fixing a hung mash process If a mash update process hangs, all future invocations from cron of the mash scripts will exit without taking action because of the hung process. Thus, it is important to identify and remove any hung processes so that future updates can proceed. 
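For the production "Reverting changes" steps above, the reflog lookup and reset might look like the following sketch; HEAD@{1} is only an example, so substitute whichever reflog entry or commit hash corresponds to the state that was deployed before the update:

:::console
# cd /opt/topology
# git reflog                   # identify the commit that was checked out before the pull
# git reset --hard HEAD@{1}    # example only; use the commit or reflog entry you identified above

The same pattern applies to /opt/topology-webhook when reverting the webhook app.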
Use the procedure below to remove any hung mash processes; doing so is safe in that it will not adversely affect the Yum repositories being served from the host. In the listing of log files (see above), view the file update_all_repos.err In the error log file, look for messages such as: Wed Jan 20 18:10:02 UTC 2016: **Can't acquire lock, is update_all_repos.sh already running?** This message indicates that the most recent update attempt quit early due to the presence of a lock file, most likely from a hung mash process. Look for mash processes: root@host # ps -C mash -o pid,ppid,pgid,start,command PID PPID PGID STARTED COMMAND 24551 24549 23455 Jan 15 /usr/bin/python /usr/bin/mash osg-3.1-el5-release -o 24552 24551 23455 Jan 15 /usr/bin/python /usr/bin/mash osg-3.1-el5-release -o If there are mash processes that started on a previous date or more than 2 hours ago, it is best to remove their corresponding process groups (PGID above): root@host # kill -TERM -23455 Then verify that the old processes are gone using the same ps command as above: root@host # ps -C mash -o pid,ppid,pgid,start,command PID PPID PGID STARTED COMMAND If any part of this process does not look or work as expected: Create a Freshdesk ticket with a subject like \u201cRepo update logs are too old on <hostname>\u201d and with relevant details in the body Assign the ticket to the \u201cSoftware\u201d group","title":"Troubleshooting Guide for Yum Repository Scripts"},{"location":"troubleshooting/repository-scripts/#troubleshooting-guide-for-yum-repository-scripts","text":"The repo.opensciencegrid.org and repo-itb.opensciencegrid.org hosts contain the OSG Yum software repositories plus related services and tools. In particular, the mash software is used to download RPMs from where they are built (at the University of Wisconsin\u2013Madison), and there are some associated scripts to configure and invoke mash periodically. 
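A quick way to review the "Last modified" timestamps called out in the Monitoring section is a modification-time-sorted listing; this is just one possible invocation:

:::console
# cd /var/log/repo
# ls -lt update_repo.*.log update_all_repos.err    # newest first; stale files sort to the bottom

Files older than the 2-hour and 6-hour thresholds described in the Monitoring section are the ones that warrant troubleshooting or a ticket.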
Use this guide to monitor the mash system for problems and to perform basic troubleshooting when such problems arise.","title":"Troubleshooting Guide for Yum Repository Scripts"},{"location":"troubleshooting/repository-scripts/#monitoring","text":"To monitor the repository hosts for proper mash operation, do the following steps on each host: ssh to repo.opensciencegrid.org and cd into /var/log/repo to view logs from mash updates Examine the \u201cLast modified\u201d timestamp of all of the update_repo.*.log files If the timestamps are all less than 2 hours old, life is good and you can skip the remaining steps below Otherwise, examine the \u201cLast modified\u201d timestamp of the update_all_repos.err file If the update_all_repos.err timestamp is current, there may be a mash process that is hung; see the Troubleshooting steps below If all timestamps are more than 6 hours old, something may be wrong with cron or its mash entries: Verify that cron is running and that the cron entries for mash are still present; if not, try to restore things Otherwise, create a Freshdesk ticket with a subject like \u201cRepo update logs are too old on <hostname>\u201d and with relevant details in the body Assign the ticket to the \u201cSoftware\u201d group","title":"Monitoring"},{"location":"troubleshooting/repository-scripts/#troubleshooting-and-mitigation","text":"","title":"Troubleshooting and Mitigation"},{"location":"troubleshooting/repository-scripts/#identifying-and-fixing-a-hung-mash-process","text":"If a mash update process hangs, all future invocations from cron of the mash scripts will exit without taking action because of the hung process. Thus, it is important to identify and remove any hung processes so that future updates can proceed. Use the procedure below to remove any hung mash processes; doing so is safe in that it will not adversely affect the Yum repositories being served from the host. In the listing of log files (see above), view the file update_all_repos.err In the error log file, look for messages such as: Wed Jan 20 18:10:02 UTC 2016: **Can't acquire lock, is update_all_repos.sh already running?** This message indicates that the most recent update attempt quit early due to the presence of a lock file, most likely from a hung mash process. Look for mash processes: root@host # ps -C mash -o pid,ppid,pgid,start,command PID PPID PGID STARTED COMMAND 24551 24549 23455 Jan 15 /usr/bin/python /usr/bin/mash osg-3.1-el5-release -o 24552 24551 23455 Jan 15 /usr/bin/python /usr/bin/mash osg-3.1-el5-release -o If there are mash processes that started on a previous date or more than 2 hours ago, it is best to remove their corresponding process groups (PGID above): root@host # kill -TERM -23455 Then verify that the old processes are gone using the same ps command as above: root@host # ps -C mash -o pid,ppid,pgid,start,command PID PPID PGID STARTED COMMAND If any part of this process does not look or work as expected: Create a Freshdesk ticket with a subject like \u201cRepo update logs are too old on <hostname>\u201d and with relevant details in the body Assign the ticket to the \u201cSoftware\u201d group","title":"Identifying and fixing a hung mash process"}]}
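If the timestamps suggest a cron problem (the more-than-6-hours-old case in the Monitoring section), the basic checks might look like the following; the exact location of the mash cron entries is not documented on this page, so both the root crontab and /etc/cron.d are shown only as places to look:

:::console
# systemctl status crond    # confirm the cron daemon is running
# crontab -l                # look for the update_all_repos / mash entries (location is an assumption)
# ls /etc/cron.d/           # they may instead be installed as a drop-in file here

If cron and its entries look healthy but updates are still stale, fall back to the Freshdesk ticket procedure described above.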
\ No newline at end of file
diff --git a/sitemap.xml b/sitemap.xml
index d9bfcb38..f34fd5e5 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -1,123 +1,123 @@
https://osg-htc.org/operations/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/services/install-gwms-factory/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/services/topology/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/services/topology-contacts-data/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/services/finalize-cache-registration/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/services/sending-announcements/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/services/gracc-corrections/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/services/hosted-ce-definitions/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/services/ce-monitoring-dashboards/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/troubleshooting/repository-scripts/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/services/adding-external-cvmfs-repos/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/general/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/access-point/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/collector/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/gracc/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/gwms-factory/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/gwms-frontend/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/htcss-central-manager/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/hosted-ce/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/message-broker/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/oasis/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/osdf-core/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/osdf-cache/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/osdf-origin/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/perfsonar/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/software-repo/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/topology/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/web-pages/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/SLA/xdlogin/
- 2024-09-06
+ 2024-09-13
daily
https://osg-htc.org/operations/external-oasis-repos/
- 2024-09-06
+ 2024-09-13
daily
\ No newline at end of file
diff --git a/sitemap.xml.gz b/sitemap.xml.gz
index e17dbb341c20c79de635d245d625d4d8785f25ac..f101e3fb99d8d2613f42467017c17b22c34be4f3 100644
GIT binary patch
delta 493
zcmVy4?8B0IS=Rf_+bxykr7LJsU-Q)hy?Jzs?F^6OK^~*=|u6r3C`g2K)btlf5kKI)3
z^4{8YUH2R-&am*FO0euEIk3GO?P34i4ejpmuxn4B$PMi>%WdLdrs}qfv|xXMDhOFF
z9HoS9Gpov9n}Di-6e^fMTtx(eY&K0`VwP%mvR|bS%?ud=sw%P96{cMYYL*cpfikB;
z-*knGfTOArPwdC0kx(C0)4}SfL(
z3Z9FH84Bxbn^>LqlJ#1_>ui7Q35g*X4<>-*Kk1|tycq=!Ys-0XG@a1ZpazsHo=+KQ
zLS!Md6&9q1do$!~S`?P13tRudm7z-88-JNcG8?!S)-Rtw+6YC?Srd}9G!U5uQ(;Y9
z9&?7anN{VlO+ZyZ3Kh&Bt||gSHk&3eF-x_3*{{-vW_HCzP*q~DD@
jSE>fXHAtBBRQjt_-X`dOmdX#?i-3optaCR_oe}^5RBzwt