Automatically restart the failed workflow instances

Recently I came across a situation in which a customer had email-enabled a document library and had set up a custom workflow to start whenever a new item is added to the library. This setup worked as expected, but when end users sent a larger number of emails at the same time (say 20-30 emails in a minute), we observed that a few workflow instances failed with the status “Error Occurred”, as shown below.

 image

While we investigated the internals of the workflow runtime and this behavior, we decided to implement a workaround so that end users could continue working. We planned a custom timer job that runs periodically, scans for all the failed instances, and restarts them.

 

Timer Job

The following is the code that we run in the timer job (for testing purposes you can put this code in a Windows Forms application; I have left the commented-out message boxes in the code). We make a periodic entry in the “Tasks” list just to verify that our timer job is running fine; you may disable this code.

 using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.SharePoint;
using Microsoft.SharePoint.Workflow;
using Microsoft.SharePoint.Administration;

namespace WFRestart
{
    /// <summary>
    /// Timer job that scans a list for workflow instances of one workflow association
    /// that are in the failed state, cancels them and starts them again.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the web-application constructor registers the job with
    /// SPJobLockType.ContentDatabase, so Execute is presumably invoked once per content
    /// database while WorkflowRestart always targets the same hard-coded site - confirm
    /// that this is intended.
    /// </remarks>
    public class WFRestart : SPJobDefinition
    {
        // The combination of state flags this job treats as "failed". It is the enum
        // equivalent of the text "Running, Faulting, Terminated" that a [Flags] enum
        // produces from ToString(), compared as a value instead of as a string.
        private const SPWorkflowState FailedState =
            SPWorkflowState.Running | SPWorkflowState.Faulting | SPWorkflowState.Terminated;

        /// <summary>Parameterless constructor.</summary>
        public WFRestart()
            : base()
        {
        }

        /// <summary>Creates the job for a given service/server with the given lock type.</summary>
        public WFRestart(string jobName, SPService service, SPServer server, SPJobLockType targetType)
            : base(jobName, service, server, targetType)
        {
        }

        /// <summary>
        /// Creates the job for a web application, locked per content database,
        /// and sets its title to "WFRestart".
        /// </summary>
        public WFRestart(string jobName, SPWebApplication webApplication)
            : base(jobName, webApplication, null, SPJobLockType.ContentDatabase)
        {
            this.Title = "WFRestart";
        }

        /// <summary>
        /// Entry point of the timer job: writes a heartbeat task, then restarts failed workflows.
        /// </summary>
        /// <param name="contentDbId">Id of the content database this run is executing for.</param>
        public override void Execute(Guid contentDbId)
        {
            // The heartbeat is only a diagnostic aid; it is best-effort so that a missing
            // "Tasks" list or an empty content database cannot prevent the restart below.
            WriteHeartbeat(contentDbId);

            // call this function to restart all the failed instances (e.g. error occurred)
            // you can make this code generic by passing the input parameters using the sharepoint object model
            // for configuration purposes create a list named 'WFRestartConfig',
            // put these values in a list item and retrieve them to pass to this function
            //WorkflowRestart(site name, list name, workflow Association name)
            WorkflowRestart("https://teja19141622:100", "MyLibrary", "CustomWorkflow");
        }

        /// <summary>
        /// Adds an item titled with the current date/time to the "Tasks" list of the root web
        /// of the first site collection in the given content database.
        /// Failures are traced and otherwise ignored.
        /// </summary>
        /// <param name="contentDbId">Id of the content database to write the heartbeat into.</param>
        private void WriteHeartbeat(Guid contentDbId)
        {
            try
            {
                SPWebApplication webApplication = this.Parent as SPWebApplication;
                if (webApplication == null)
                {
                    return;
                }

                SPContentDatabase contentDb = webApplication.ContentDatabases[contentDbId];
                if (contentDb == null || contentDb.Sites.Count == 0)
                {
                    return;
                }

                // The SPSite handed out by the Sites indexer must be disposed by the caller.
                // RootWeb belongs to the site and is released together with it.
                using (SPSite site = contentDb.Sites[0])
                {
                    SPList taskList = site.RootWeb.Lists["Tasks"];

                    // create a new task and set the Title to the current day/time
                    SPListItem newTask = taskList.Items.Add();
                    newTask["Title"] = DateTime.Now.ToString();
                    newTask.Update();
                }
            }
            catch (Exception ex)
            {
                // Trace calls are compiled in only when the TRACE symbol is defined.
                System.Diagnostics.Trace.TraceError("WFRestart: heartbeat entry could not be written: {0}", ex);
            }
        }

        /// <summary>
        /// Restarts every failed instance of the named workflow association on the items of a list.
        /// Errors are traced instead of being thrown to the caller.
        /// </summary>
        /// <param name="strSite">URL of the site collection to open.</param>
        /// <param name="strList">Name of the list whose items are scanned.</param>
        /// <param name="strWFAssociation">Internal name of the workflow association to match.</param>
        private void WorkflowRestart(string strSite, string strList, string strWFAssociation)
        {
            try
            {
                using (SPSite site = new SPSite(strSite))
                using (SPWeb web = site.OpenWeb())
                {
                    SPList list = web.Lists[strList];
                    foreach (SPWorkflowAssociation wfAssoc in list.WorkflowAssociations)
                    {
                        if (!string.Equals(wfAssoc.InternalName, strWFAssociation, StringComparison.Ordinal))
                        {
                            continue;
                        }

                        foreach (SPListItem item in list.Items)
                        {
                            RestartIfFailed(site, item, wfAssoc);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                System.Diagnostics.Trace.TraceError(
                    "WFRestart: scanning list '{0}' on '{1}' failed: {2}", strList, strSite, ex);
            }
        }

        /// <summary>
        /// Cancels and restarts the first workflow instance on the item that belongs to the
        /// given association and is in the failed state. An error on one item is traced and
        /// does not stop the scan of the remaining items.
        /// </summary>
        private static void RestartIfFailed(SPSite site, SPListItem item, SPWorkflowAssociation wfAssoc)
        {
            try
            {
                SPWorkflowCollection workflows = item.Workflows;
                for (int i = 0; i < workflows.Count; i++)
                {
                    SPWorkflow workflow = workflows[i];
                    if (!workflow.AssociationId.Equals(wfAssoc.Id) || workflow.InternalState != FailedState)
                    {
                        continue;
                    }

                    SPWorkflowManager.CancelWorkflow(workflow);
                    site.WorkflowManager.StartWorkflow(item, wfAssoc, wfAssoc.AssociationData, true);

                    // One restart per item is enough; stop looking at this item's instances.
                    return;
                }
            }
            catch (Exception ex)
            {
                System.Diagnostics.Trace.TraceError(
                    "WFRestart: restarting the workflow on item {0} failed: {1}", item.ID, ex);
            }
        }
    }
}

Timer Job Registration

We can use a feature and its events to register this timer job.

Feature:  WFRestartFeature

 <!-- Feature that registers/unregisters the WFRestart timer job through its receiver class.
      The xmlns value is the SharePoint feature schema namespace; it is an identifier, not a
      link, and must use the http:// form exactly. -->
 <Feature   
  Id="220DE1B8-B38E-42af-B917-85D99620D0F2" 
  Title="WFRestartFeature" 
  Scope="Web" 
  ReceiverAssembly="WFRestart, Version=1.0.0.0, Culture=neutral, PublicKeyToken=972558cd6f822bd6" 
  ReceiverClass="WFRestart.WFRestartInstall"  
  Hidden="False" 
  xmlns="http://schemas.microsoft.com/sharepoint/">
</Feature>

Feature Receiver:

 using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.SharePoint;
using Microsoft.SharePoint.Administration;

namespace WFRestart
{
    /// <summary>
    /// Feature receiver that registers the WFRestart timer job when the feature is
    /// activated and removes it when the feature is deactivated.
    /// </summary>
    public class WFRestartInstall : SPFeatureReceiver
    {
        const string JOB_NAME = "WFRestart";

        // This is for testing purposes: the URL of a site in the web application under which
        // the timer job is registered. The web application could instead be derived from
        // properties.Feature.Parent.
        // NOTE(review): for a Web-scoped feature the parent is presumably an SPWeb, not an
        // SPSite - confirm before switching to it.
        private const string JobSiteUrl = "https://teja19141622:100";

        /// <summary>No work is done when the feature is installed.</summary>
        public override void FeatureInstalled(SPFeatureReceiverProperties properties)
        {
        }

        /// <summary>No work is done when the feature is being uninstalled.</summary>
        public override void FeatureUninstalling(SPFeatureReceiverProperties properties)
        {
        }

        /// <summary>
        /// Removes any existing job named JOB_NAME from the web application and registers
        /// a fresh WFRestart job that runs every 5 minutes.
        /// </summary>
        public override void FeatureActivated(SPFeatureReceiverProperties properties)
        {
            // 'using' guarantees the site is disposed even if registration throws.
            using (SPSite jobSite = new SPSite(JobSiteUrl))
            {
                SPWebApplication webApplication = jobSite.WebApplication;

                // make sure the job isn't already registered
                DeleteExistingJobs(webApplication);

                // install the job
                WFRestart wfRestartJob = new WFRestart(JOB_NAME, webApplication);

                SPMinuteSchedule schedule = new SPMinuteSchedule();
                schedule.BeginSecond = 0;
                schedule.EndSecond = 59;
                schedule.Interval = 5;
                wfRestartJob.Schedule = schedule;

                wfRestartJob.Update();
            }
        }

        /// <summary>Removes every job named JOB_NAME from the web application.</summary>
        public override void FeatureDeactivating(SPFeatureReceiverProperties properties)
        {
            using (SPSite jobSite = new SPSite(JobSiteUrl))
            {
                // delete the job
                DeleteExistingJobs(jobSite.WebApplication);
            }
        }

        /// <summary>
        /// Deletes all job definitions of the web application whose name equals JOB_NAME.
        /// </summary>
        private static void DeleteExistingJobs(SPWebApplication webApplication)
        {
            // Collect the matches first so that nothing is deleted while the
            // JobDefinitions collection is still being enumerated.
            List<SPJobDefinition> matchingJobs = new List<SPJobDefinition>();
            foreach (SPJobDefinition job in webApplication.JobDefinitions)
            {
                if (job.Name == JOB_NAME)
                {
                    matchingJobs.Add(job);
                }
            }

            foreach (SPJobDefinition job in matchingJobs)
            {
                job.Delete();
            }
        }
    }
}

 

All the failed workflow instances will automatically be restarted within 5 minutes.

image