(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[9806],{83607:function(e,s,n){(window.__NEXT_P=window.__NEXT_P||[]).push(["/docs/community-docs/watcloud/maintenance-manual",function(){return n(74558)}])},74558:function(e,s,n){"use strict";n.r(s),n.d(s,{__toc:function(){return t}});var r=n(85893),l=n(31379),a=n(82643);let t=[{depth:2,value:"General procedure",id:"general-procedure"},{depth:2,value:"SLURM",id:"slurm"},{depth:3,value:"Cluster overview",id:"cluster-overview"},{depth:3,value:"Performing maintenance on a node",id:"performing-maintenance-on-a-node"},{depth:4,value:"Creating a reservation",id:"creating-a-reservation"},{depth:4,value:"Starting maintenance",id:"starting-maintenance"},{depth:4,value:"Taking a node out of maintenance mode",id:"taking-a-node-out-of-maintenance-mode"}];function _createMdxContent(e){let s=Object.assign({h1:"h1",p:"p",h2:"h2",ol:"ol",li:"li",strong:"strong",a:"a",h3:"h3",pre:"pre",code:"code",span:"span",h4:"h4",sup:"sup",section:"section"},(0,a.a)(),e.components);return(0,r.jsxs)(r.Fragment,{children:[(0,r.jsx)(s.h1,{children:"WATcloud Maintenance Manual"}),"\n",(0,r.jsx)(s.p,{children:"This manual outlines the maintenance procedures for various components of WATcloud."}),"\n",(0,r.jsx)(s.h2,{id:"general-procedure",children:"General procedure"}),"\n",(0,r.jsxs)(s.ol,{children:["\n",(0,r.jsxs)(s.li,{children:[(0,r.jsx)(s.strong,{children:"Plan the maintenance"}),": Prepare a plan for the maintenance, including the start time, end time, and the steps to be taken during the maintenance. Identify the components and services that will be affected. Try to minimize the impact on users by using strategies like rolling updates."]}),"\n",(0,r.jsxs)(s.li,{children:[(0,r.jsx)(s.strong,{children:"Notify users"}),": If the maintenance will affect users, ",(0,r.jsx)(s.a,{href:"https://github.com/WATonomous/infrastructure-support/discussions",children:"notify them in advance"}),". Make sure to give users plenty of time to prepare for the maintenance. In general, one week's notice is recommended."]}),"\n",(0,r.jsxs)(s.li,{children:[(0,r.jsx)(s.strong,{children:"Perform the maintenance"}),": Follow the steps outlined in the maintenance plan. If the maintenance runs over the scheduled end time, notify users of the delay."]}),"\n",(0,r.jsxs)(s.li,{children:[(0,r.jsx)(s.strong,{children:"Verify the maintenance"}),": After the maintenance is complete, verify that all components are working as expected (including CI pipelines). If there are any issues, address them immediately. Use ",(0,r.jsx)(s.a,{href:"./observability",children:"observability tools"})," to monitor the health of the system."]}),"\n",(0,r.jsxs)(s.li,{children:[(0,r.jsx)(s.strong,{children:"Notify users"}),": Once the maintenance is complete, update the maintenance announcement to indicate that the maintenance is complete. If there were any issues during the maintenance, provide details on what happened and how it was resolved."]}),"\n"]}),"\n",(0,r.jsx)(s.h2,{id:"slurm",children:"SLURM"}),"\n",(0,r.jsx)(s.p,{children:"This section outlines the maintenance procedures for the SLURM cluster."}),"\n",(0,r.jsx)(s.h3,{id:"cluster-overview",children:"Cluster overview"}),"\n",(0,r.jsx)(s.p,{children:"To get a general overview of the health of the SLURM cluster, you can run:"}),"\n",(0,r.jsx)(s.pre,{"data-language":"bash","data-theme":"default",hasCopyCode:!0,children:(0,r.jsx)(s.code,{"data-language":"bash","data-theme":"default",children:(0,r.jsxs)(s.span,{className:"line",children:[(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-function)"},children:"sinfo"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"--long"})]})})}),"\n",(0,r.jsx)(s.p,{children:"Example output:"}),"\n",(0,r.jsx)(s.pre,{"data-language":"text","data-theme":"default",children:(0,r.jsxs)(s.code,{"data-language":"text","data-theme":"default",children:[(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"Thu Apr 18 17:16:26 2024"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"PARTITION AVAIL  TIMELIMIT   JOB_SIZE ROOT OVERSUBS     GROUPS  NODES       STATE RESERVATION NODELIST"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"compute*     up 1-00:00:00 1-infinite   no       NO        all      1     drained             tr-slurm1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"compute*     up 1-00:00:00 1-infinite   no       NO        all      1       mixed             thor-slurm1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"compute*     up 1-00:00:00 1-infinite   no       NO        all      3        idle             trpro-slurm[1-2],wato2-slurm1"})})]})}),"\n",(0,r.jsxs)(s.p,{children:["In the output above, ",(0,r.jsx)(s.code,{children:"tr-slurm1"})," is in the ",(0,r.jsx)(s.code,{children:"drained"})," state, which means it is not available for running jobs.\n",(0,r.jsx)(s.code,{children:"thor-slurm1"})," is in the ",(0,r.jsx)(s.code,{children:"mix"})," state, which means some jobs are running on it.\nAll other nodes are in the ",(0,r.jsx)(s.code,{children:"idle"})," state, which means there are no jobs running on them."]}),"\n",(0,r.jsx)(s.p,{children:"To get a detailed overview of nodes in the cluster, you can run:"}),"\n",(0,r.jsx)(s.pre,{"data-language":"bash","data-theme":"default",hasCopyCode:!0,children:(0,r.jsx)(s.code,{"data-language":"bash","data-theme":"default",children:(0,r.jsxs)(s.span,{className:"line",children:[(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-function)"},children:"scontrol"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"show"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"node"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" [NODE_NAME]"})]})})}),"\n",(0,r.jsxs)(s.p,{children:["The optional ",(0,r.jsx)(s.code,{children:"NODE_NAME"})," argument can be used to restrict the output to a specific node."]}),"\n",(0,r.jsx)(s.p,{children:"Example output:"}),"\n",(0,r.jsx)(s.pre,{"data-language":"text","data-theme":"default",children:(0,r.jsxs)(s.code,{"data-language":"text","data-theme":"default",children:[(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"> scontrol show node tr-slurm1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"NodeName=tr-slurm1 Arch=x86_64 CoresPerSocket=1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CPUAlloc=0 CPUEfctv=58 CPUTot=60 CPULoad=0.01"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   AvailableFeatures=(null)"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   ActiveFeatures=(null)"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   Gres=gpu:grid_p40:1(S:0),shard:grid_p40:8K(S:0),tmpdisk:100K"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   NodeAddr=tr-slurm1.ts.watonomous.ca NodeHostName=tr-slurm1 Version=23.11.4"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   OS=Linux 5.15.0-100-generic #110-Ubuntu SMP Wed Feb 7 13:27:48 UTC 2024"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   RealMemory=39140 AllocMem=0 FreeMem=29723 Sockets=60 Boards=1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CoreSpecCount=2 CPUSpecList=58-59 MemSpecLimit=2048"})}),"\n",(0,r.jsx)(s.span,{className:"line highlighted",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   State=IDLE+DRAIN ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   Partitions=compute"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   BootTime=2024-03-17T03:32:45 SlurmdStartTime=2024-04-13T20:55:32"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   LastBusyTime=2024-04-16T19:16:13 ResumeAfterTime=None"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CfgTRES=cpu=58,mem=39140M,billing=58,gres/gpu=1,gres/shard=8192,gres/tmpdisk=102400"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   AllocTRES="})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CapWatts=n/a"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CurrentWatts=0 AveWatts=0"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   ExtSensorsJoules=n/a ExtSensorsWatts=0 ExtSensorsTemp=n/a"})}),"\n",(0,r.jsx)(s.span,{className:"line highlighted",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   Reason=Performing maintenance on baremetal [root@2024-04-18T17:06:20] "})})]})}),"\n",(0,r.jsxs)(s.p,{children:["In the output above, we can see that the reason ",(0,r.jsx)(s.code,{children:"tr-slurm1"})," is in the ",(0,r.jsx)(s.code,{children:"drained"})," state (a.k.a. ",(0,r.jsx)(s.code,{children:"IDLE+DRAIN"}),") for reason ",(0,r.jsx)(s.code,{children:"Performing maintenance on baremetal"}),".\nThe ",(0,r.jsx)(s.code,{children:"Reason"})," field is an arbitrary user-specified string that can be set when performing actions on nodes."]}),"\n",(0,r.jsx)(s.h3,{id:"performing-maintenance-on-a-node",children:"Performing maintenance on a node"}),"\n",(0,r.jsx)(s.h4,{id:"creating-a-reservation",children:"Creating a reservation"}),"\n",(0,r.jsxs)(s.p,{children:["The first step to performing maintenance on a node is to create a reservation",(0,r.jsx)(s.sup,{children:(0,r.jsx)(s.a,{href:"#user-content-fn-slurm-reservation",id:"user-content-fnref-slurm-reservation","data-footnote-ref":!0,"aria-describedby":"footnote-label",children:"1"})}),".\nThis ensures that user-submitted jobs do not get dispatched to the target nodes if they cannot be\ncompleted before the maintenance window starts."]}),"\n",(0,r.jsx)(s.pre,{"data-language":"bash","data-theme":"default",hasCopyCode:!0,children:(0,r.jsx)(s.code,{"data-language":"bash","data-theme":"default",children:(0,r.jsxs)(s.span,{className:"line",children:[(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-function)"},children:"scontrol"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"create"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"reservation"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"starttime="}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-keyword)"},children:"<"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"START_TIM"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"E"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-keyword)"},children:">"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"duration="}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-keyword)"},children:"<"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"DURATION_IN_MINUTE"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"S"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-keyword)"},children:">"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"user=root"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"flags=maint"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"nodes="}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-keyword)"},children:"<"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"NODE_NAME"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"S"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-keyword)"},children:">"})]})})}),"\n",(0,r.jsxs)(s.p,{children:["The time zone for the ",(0,r.jsx)(s.code,{children:"starttime"})," argument is the local time zone where the command is run.\nTo see the local timezone, run ",(0,r.jsx)(s.code,{children:"timedatectl"}),"."]}),"\n",(0,r.jsx)(s.p,{children:"Here's an example:"}),"\n",(0,r.jsx)(s.pre,{"data-language":"bash","data-theme":"default",hasCopyCode:!0,children:(0,r.jsx)(s.code,{"data-language":"bash","data-theme":"default",children:(0,r.jsxs)(s.span,{className:"line",children:[(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-function)"},children:"scontrol"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"create"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"reservation"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"starttime="}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-constant)"},children:"2024"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"-04-30T21:00:00"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"duration="}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-constant)"},children:"480"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"user=root"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"flags=maint"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"nodes=trpro-slurm1,trpro-slurm2"})]})})}),"\n",(0,r.jsx)(s.p,{children:"output:"}),"\n",(0,r.jsx)(s.pre,{"data-language":"text","data-theme":"default",children:(0,r.jsx)(s.code,{"data-language":"text","data-theme":"default",children:(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"Reservation created: root_4"})})})}),"\n",(0,r.jsxs)(s.p,{children:["This command creates a reservation named ",(0,r.jsx)(s.code,{children:"root_4"}),". The reservation starts on\nApril 30, 2024, at 9:00 PM local time and lasts for\n8 hours (480 minutes) for the nodes ",(0,r.jsx)(s.code,{children:"trpro-slurm1"})," and ",(0,r.jsx)(s.code,{children:"trpro-slurm2"}),".\nDuring this time, only the root user can run jobs on these nodes.\nJobs submitted by other users will be queued until the reservation is over."]}),"\n",(0,r.jsx)(s.p,{children:"To see existing reservations, you can run:"}),"\n",(0,r.jsx)(s.pre,{"data-language":"bash","data-theme":"default",hasCopyCode:!0,children:(0,r.jsx)(s.code,{"data-language":"bash","data-theme":"default",children:(0,r.jsxs)(s.span,{className:"line",children:[(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-function)"},children:"scontrol"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"show"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"reservation"})]})})}),"\n",(0,r.jsx)(s.p,{children:"To delete a resservation, you can run:"}),"\n",(0,r.jsx)(s.pre,{"data-language":"bash","data-theme":"default",hasCopyCode:!0,children:(0,r.jsx)(s.code,{"data-language":"bash","data-theme":"default",children:(0,r.jsxs)(s.span,{className:"line",children:[(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-function)"},children:"scontrol"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"delete"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"reservation"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-keyword)"},children:"<"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"RESERVATION_NAM"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"E"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-keyword)"},children:">"})]})})}),"\n",(0,r.jsx)(s.h4,{id:"starting-maintenance",children:"Starting maintenance"}),"\n",(0,r.jsx)(s.p,{children:"Before performing maintenance on a node, you should drain the node to ensure no jobs are running on it and no new jobs are scheduled to run on it."}),"\n",(0,r.jsx)(s.pre,{"data-language":"bash","data-theme":"default",children:(0,r.jsx)(s.code,{"data-language":"bash","data-theme":"default",children:(0,r.jsxs)(s.span,{className:"line",children:[(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-function)"},children:"scontrol"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"update"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"nodename="}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string-expression)"},children:'"<NODE_NAME>"'}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"state=drain"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"reason="}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string-expression)"},children:'"Performing maintenance for reason X"'})]})})}),"\n",(0,r.jsx)(s.p,{children:"For example:"}),"\n",(0,r.jsx)(s.pre,{"data-language":"bash","data-theme":"default",hasCopyCode:!0,children:(0,r.jsx)(s.code,{"data-language":"bash","data-theme":"default",children:(0,r.jsxs)(s.span,{className:"line",children:[(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-function)"},children:"scontrol"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"update"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"nodename="}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string-expression)"},children:'"tr-slurm1"'}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"state=drain"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"reason="}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string-expression)"},children:'"Performing maintenance on baremetal"'})]})})}),"\n",(0,r.jsxs)(s.p,{children:["This will drain the node ",(0,r.jsx)(s.code,{children:"tr-slurm1"})," (prevent new jobs from running on it) and set the reason to ",(0,r.jsx)(s.code,{children:"Performing maintenance on baremetal"}),".\nIf there are no jobs running on the node, the node state becomes ",(0,r.jsx)(s.code,{children:"drained"})," (a.k.a. ",(0,r.jsx)(s.code,{children:"IDLE+DRAIN"})," in ",(0,r.jsx)(s.code,{children:"scontrol"}),").\nIf there are jobs running on the node, the node state becomes ",(0,r.jsx)(s.code,{children:"draining"})," (a.k.a. ",(0,r.jsx)(s.code,{children:"MIXED+DRAIN"})," in ",(0,r.jsx)(s.code,{children:"scontrol"}),").\nIn this case, SLURM will wait for the jobs to finish before transitioning the node to the ",(0,r.jsx)(s.code,{children:"drained"})," state."]}),"\n",(0,r.jsxs)(s.p,{children:["Example output from when a node is in the ",(0,r.jsx)(s.code,{children:"draining"})," state:"]}),"\n",(0,r.jsx)(s.pre,{"data-language":"text","data-theme":"default",children:(0,r.jsxs)(s.code,{"data-language":"text","data-theme":"default",children:[(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"> sinfo --long"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"Thu Apr 18 17:17:35 2024"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"PARTITION AVAIL  TIMELIMIT   JOB_SIZE ROOT OVERSUBS     GROUPS  NODES       STATE RESERVATION NODELIST"})}),"\n",(0,r.jsx)(s.span,{className:"line highlighted",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"compute*     up 1-00:00:00 1-infinite   no       NO        all      1    draining             tr-slurm1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"compute*     up 1-00:00:00 1-infinite   no       NO        all      1       mixed             thor-slurm1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"compute*     up 1-00:00:00 1-infinite   no       NO        all      3        idle             trpro-slurm[1-2],wato2-slurm1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"}})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"> scontrol show node tr-slurm1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"NodeName=tr-slurm1 Arch=x86_64 CoresPerSocket=1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CPUAlloc=1 CPUEfctv=58 CPUTot=60 CPULoad=0.00"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   AvailableFeatures=(null)"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   ActiveFeatures=(null)"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   Gres=gpu:grid_p40:1(S:0),shard:grid_p40:8K(S:0),tmpdisk:100K"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   NodeAddr=tr-slurm1.ts.watonomous.ca NodeHostName=tr-slurm1 Version=23.11.4"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   OS=Linux 5.15.0-100-generic #110-Ubuntu SMP Wed Feb 7 13:27:48 UTC 2024"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   RealMemory=39140 AllocMem=512 FreeMem=29688 Sockets=60 Boards=1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CoreSpecCount=2 CPUSpecList=58-59 MemSpecLimit=2048"})}),"\n",(0,r.jsx)(s.span,{className:"line highlighted",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   State=MIXED+DRAIN ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   Partitions=compute"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   BootTime=2024-03-17T03:32:45 SlurmdStartTime=2024-04-13T20:55:32"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   LastBusyTime=2024-04-18T17:15:30 ResumeAfterTime=None"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CfgTRES=cpu=58,mem=39140M,billing=58,gres/gpu=1,gres/shard=8192,gres/tmpdisk=102400"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   AllocTRES=cpu=1,mem=512M,gres/tmpdisk=300"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CapWatts=n/a"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CurrentWatts=0 AveWatts=0"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   ExtSensorsJoules=n/a ExtSensorsWatts=0 ExtSensorsTemp=n/a"})}),"\n",(0,r.jsx)(s.span,{className:"line highlighted",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   Reason=Performing maintenance on baremetal [root@2024-04-18T17:16:01]"})})]})}),"\n",(0,r.jsxs)(s.p,{children:["After jobs finish running on the node, the node will transition to the ",(0,r.jsx)(s.code,{children:"drained"})," state:"]}),"\n",(0,r.jsx)(s.pre,{"data-language":"text","data-theme":"default",children:(0,r.jsxs)(s.code,{"data-language":"text","data-theme":"default",children:[(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"> sinfo --long"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"Thu Apr 18 17:22:07 2024"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"PARTITION AVAIL  TIMELIMIT   JOB_SIZE ROOT OVERSUBS     GROUPS  NODES       STATE RESERVATION NODELIST"})}),"\n",(0,r.jsx)(s.span,{className:"line highlighted",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"compute*     up 1-00:00:00 1-infinite   no       NO        all      1     drained             tr-slurm1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"compute*     up 1-00:00:00 1-infinite   no       NO        all      1       mixed             thor-slurm1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"compute*     up 1-00:00:00 1-infinite   no       NO        all      3        idle             trpro-slurm[1-2],wato2-slurm1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"}})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"> scontrol show node tr-slurm1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"NodeName=tr-slurm1 Arch=x86_64 CoresPerSocket=1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CPUAlloc=0 CPUEfctv=58 CPUTot=60 CPULoad=0.00"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   AvailableFeatures=(null)"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   ActiveFeatures=(null)"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   Gres=gpu:grid_p40:1(S:0),shard:grid_p40:8K(S:0),tmpdisk:100K"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   NodeAddr=tr-slurm1.ts.watonomous.ca NodeHostName=tr-slurm1 Version=23.11.4"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   OS=Linux 5.15.0-100-generic #110-Ubuntu SMP Wed Feb 7 13:27:48 UTC 2024"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   RealMemory=39140 AllocMem=0 FreeMem=29688 Sockets=60 Boards=1"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CoreSpecCount=2 CPUSpecList=58-59 MemSpecLimit=2048"})}),"\n",(0,r.jsx)(s.span,{className:"line highlighted",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   State=IDLE+DRAIN ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   Partitions=compute"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   BootTime=2024-03-17T03:32:45 SlurmdStartTime=2024-04-13T20:55:32"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   LastBusyTime=2024-04-18T17:21:13 ResumeAfterTime=None"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CfgTRES=cpu=58,mem=39140M,billing=58,gres/gpu=1,gres/shard=8192,gres/tmpdisk=102400"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   AllocTRES="})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CapWatts=n/a"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   CurrentWatts=0 AveWatts=0"})}),"\n",(0,r.jsx)(s.span,{className:"line",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   ExtSensorsJoules=n/a ExtSensorsWatts=0 ExtSensorsTemp=n/a"})}),"\n",(0,r.jsx)(s.span,{className:"line highlighted",children:(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:"   Reason=Performing maintenance on baremetal [root@2024-04-18T17:16:01]"})})]})}),"\n",(0,r.jsxs)(s.p,{children:["Once the node is in the ",(0,r.jsx)(s.code,{children:"drained"})," state, you can perform maintenance on it."]}),"\n",(0,r.jsx)(s.h4,{id:"taking-a-node-out-of-maintenance-mode",children:"Taking a node out of maintenance mode"}),"\n",(0,r.jsx)(s.p,{children:"To take a node out of maintenance mode, you can run:"}),"\n",(0,r.jsx)(s.pre,{"data-language":"bash","data-theme":"default",children:(0,r.jsx)(s.code,{"data-language":"bash","data-theme":"default",children:(0,r.jsxs)(s.span,{className:"line",children:[(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-function)"},children:"scontrol"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"update"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"nodename="}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string-expression)"},children:'"<NODE_NAME>"'}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"state=resume"})]})})}),"\n",(0,r.jsx)(s.p,{children:"For example:"}),"\n",(0,r.jsx)(s.pre,{"data-language":"bash","data-theme":"default",hasCopyCode:!0,children:(0,r.jsx)(s.code,{"data-language":"bash","data-theme":"default",children:(0,r.jsxs)(s.span,{className:"line",children:[(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-function)"},children:"scontrol"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"update"}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"nodename="}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string-expression)"},children:'"tr-slurm1"'}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-color-text)"},children:" "}),(0,r.jsx)(s.span,{style:{color:"var(--shiki-token-string)"},children:"state=resume"})]})})}),"\n",(0,r.jsxs)(s.p,{children:["This will resume the node ",(0,r.jsx)(s.code,{children:"tr-slurm1"})," (allow new jobs to run on it) and clear the reason."]}),"\n",(0,r.jsxs)(s.p,{children:["Also remember to delete any unexpired reservations (as outlined in ",(0,r.jsx)(s.a,{href:"#creating-a-reservation",children:"Creating a reservation"}),")."]}),"\n","\n",(0,r.jsxs)(s.section,{"data-footnotes":!0,className:"footnotes",children:[(0,r.jsx)(s.h2,{className:"sr-only",id:"footnote-label",children:"Footnotes"}),"\n",(0,r.jsxs)(s.ol,{children:["\n",(0,r.jsxs)(s.li,{id:"user-content-fn-slurm-reservation",children:["\n",(0,r.jsxs)(s.p,{children:["The official documentation for reservations is at ",(0,r.jsx)(s.a,{href:"https://slurm.schedmd.com/reservations.html",children:"https://slurm.schedmd.com/reservations.html"})," ",(0,r.jsx)(s.a,{href:"#user-content-fnref-slurm-reservation","data-footnote-backref":!0,className:"data-footnote-backref","aria-label":"Back to content",children:"↩"})]}),"\n"]}),"\n"]}),"\n"]})]})}s.default=(0,l.j)({MDXContent:function(){let e=arguments.length>0&&void 0!==arguments[0]?arguments[0]:{},{wrapper:s}=Object.assign({},(0,a.a)(),e.components);return s?(0,r.jsx)(s,{...e,children:(0,r.jsx)(_createMdxContent,{...e})}):_createMdxContent(e)},pageOpts:{filePath:"pages/docs/community-docs/watcloud/maintenance-manual.mdx",route:"/docs/community-docs/watcloud/maintenance-manual",timestamp:1726974952e3,title:"WATcloud Maintenance Manual",headings:t},pageNextRoute:"/docs/community-docs/watcloud/maintenance-manual"})}},function(e){e.O(0,[1379,9774,2888,179],function(){return e(e.s=83607)}),_N_E=e.O()}]);