Faster monkeys


Last time I showed how to simulate monkeys typing on typewriters, using letter frequencies based on input text, like Hamlet’s soliloquy.

The results were remarkably similar to the input text, but the output was relatively slow.

Below is a version that has the “Optimum” option that outputs much faster if the “-o” command line parameter is used.

On my machine, the slow version spits out one character every 1.2 seconds for a pattern length of 8.

With the “-o” option, creating the initial data structure takes a few seconds longer, but the text comes out almost a thousand times faster: 980 characters per second.

How can this be?

The slower version uses a single Dictionary<string,int>. The key is the length of a pattern, say “To Be or”. The value is the number of times that pattern exists in the original text. To generate text, if the prior pattern is “To Be o” then all the sum of all the values that start with “To Be o” are found, then a random one is chosen.

The faster version is essentially a dictionary of dictionaries: Dictionary<string, Dictionary<string, int>>, which precalculates the sums.

 

<code>

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
// Start Visual Studio
// File->New->Project->C#->Windows->Console Application
namespace Monkey
{
  public class Program
  {
    static string txtHamlet =
@"To be, or not to be--that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune
Or to take arms against a sea of troubles
And by opposing end them. To die, to sleep--
No more--and by a sleep to say we end
The heartache, and the thousand natural shocks
That flesh is heir to. 'Tis a consummation
Devoutly to be wished. To die, to sleep--
To sleep--perchance to dream: ay, there's the rub,
For in that sleep of death what dreams may come
When we have shuffled off this mortal coil,
Must give us pause. There's the respect
That makes calamity of so long life.
For who would bear the whips and scorns of time,
Th' oppressor's wrong, the proud man's contumely
The pangs of despised love, the law's delay,
The insolence of office, and the spurns
That patient merit of th' unworthy takes,
When he himself might his quietus make
With a bare bodkin? Who would fardels bear,
To grunt and sweat under a weary life,
But that the dread of something after death,
The undiscovered country, from whose bourn
No traveller returns, puzzles the will,
And makes us rather bear those ills we have
Than fly to others that we know not of?
Thus conscience does make cowards of us all,
And thus the native hue of resolution
Is sicklied o'er with the pale cast of thought,
And enterprise of great pitch and moment
With this regard their currents turn awry
And lose the name of action. -- Soft you now,
The fair Ophelia! -- Nymph, in thy orisons
Be all my sins remembered.
";
    // Slow way use a single dictionary:
    Dictionary<string, int> _dictData;


    // fast way: use a dictionary of dictionaries
    /// <summary>
    /// An instance of cPriorData for each sequence of letters with length = PatternLength -1
    /// </summary>
    public class cPriorData
    {
      // the # of occurrences of the prior sequence.
      public int cnt;
      // for each single char "a", "b", how many follow the sequence
      // e.g. for text "to be or not to be", and prior = "no",
      // this dict will have one entry 't' == 1
      public Dictionary<char, int> _dictChars = new Dictionary<char, int>();
      public override string ToString()
      {
        return $"{cnt}";
      }
    }

    Dictionary<string, cPriorData> _dictCPriorData = new Dictionary<string, cPriorData>();



    Random _rand = new Random(1);
    int _maxLen = -1;
    int _lenPattern = 2;
    bool _optimum = false;
    string _txt = txtHamlet;

    static void Main(string[] args)
    {
      // create a new instance of "Program", minimizing
      // the use of statics and making it 
      // much more maintainable and testable
      var prog = new Program();
      prog.DoMain(args);
    }

    internal void DoMain(string[] args)
    {
      ProcessArgs(args);
      var sw = new Stopwatch();
      sw.Start();
      if (!_optimum)
      {
        FillDictionaryFromReadingText();
      }
      else
      {
        FillDictionaryFromReadingTextOptimum();
      }
      sw.Stop();
      Console.WriteLine($"Filled dictionary in {sw.Elapsed.TotalSeconds:n2} seconds");

      Console.WriteLine($"Txt len = {_txt.Length:n0} PatLen = {_lenPattern} DictSize = {_dictCPriorData.Count:n0}  Optimum = {_optimum}");
      sw.Restart();
      int nChars = 0;
      if (!_optimum)
      {
        nChars = GenerateRandomText();
      }
      else
      {
        nChars = GenerateRandomTextOptimum();
      }

      sw.Stop();
      if (nChars > 0)
      {
        Console.WriteLine($"Generated {nChars} characters at the rate of {(nChars / sw.Elapsed.TotalSeconds):n2} chars per second");
      }
    }

    private void ProcessArgs(string[] args)
    {
      for (int i = 0; i < args.Length; i++)
      {
        switch (args[i][0])
        {
          case '/':
          case '-':
            if (args[i].Length > 1)
            {
              switch (args[i][1])
              {
                case 'f':
                  if (++i < args.Length)
                  {
                    _txt = System.IO.File.ReadAllText(args[i]);
                    Console.WriteLine($"Filename {args[i]}");
                  }
                  break;
                case 'm':
                  if (++i < args.Length)
                  {
                    if (int.TryParse(args[i], out _maxLen))
                    {
                      Console.WriteLine($"Max len = {_maxLen}");
                    }
                  }
                  break;
                case 'l':
                  if (++i < args.Length)
                  {
                    if (int.TryParse(args[i], out _lenPattern))
                    {
                      Console.WriteLine($"Pat len = {_lenPattern}");
                    }
                  }
                  break;
                case 'o':
                  _optimum = true;
                  break;
              }
            }
            break;
        }
      }
      if (_maxLen > 0 && _txt.Length > _maxLen)
      {
        _txt = _txt.Substring(0, _maxLen - 1);
      }
      //_txt = "the quick brown fox jumps over the lazy dog";
      //_lenPattern = 2;
    }

    private void FillDictionaryFromReadingText()
    {
      _dictData = new Dictionary<string, int>();
      var txtLen = _txt.Length;
      var strPrior = " ";
      for (int i = 0; i < txtLen; i++)
      {
        var chrNew = char.ToLower(_txt[i]);
        strPrior += chrNew;
        if (strPrior.Length < _lenPattern)
        {
          continue;

        }
        if (_dictData.ContainsKey(strPrior))
        {
          _dictData[strPrior] += 1;
        }
        else
        {
          _dictData[strPrior] = 1;
        }
        // lop off first char
        strPrior = strPrior.Substring(1);
      }
    }

    private int GenerateRandomText()
    {
      var strPrior = string.Empty;
      int nChars;
      for (nChars = 0; nChars < 1000000 && !Console.KeyAvailable; nChars++)
      {
        if (string.IsNullOrEmpty(strPrior))
        {
          // we need some sort of initial prior string
          // so we'll take it from a random entry in the data
          int numSkip = _rand.Next(Math.Min(10000, _dictData.Count));
          if (_dictData.Count < 30)
          {
            // when testing with very few items
            numSkip = 0;
          }
          strPrior = _dictData
              .Skip(numSkip)
              // look for one that is preceded by " "
              .SkipWhile(
                  d => !d.Key.StartsWith(" "))
              .First()
              .Key
              .Substring(_lenPattern - 1);
          // we need to print the new prior string. 
          // indicates it's a restart by using Upper case
          Console.Write(strPrior.ToUpper());

        }
        // first we find the sum of all the items that start with strPrior
        int sum = _dictData
            .Where(
            d => d.Key.StartsWith(strPrior))
            .Sum(d => d.Value);
        if (sum == 0) // last sequence in text
        {
          strPrior = " ";
          continue;
        }
        // now we get a random number from 0 to the sum
        int targetRand = _rand.Next(sum + 1);
        var target = _dictData
            .Where(d => d.Key.StartsWith(strPrior))
            // we subtract each entry's value until we reach 0
            .Where(d => (targetRand -= d.Value) <= 0)
            .FirstOrDefault();
        var newchar = target.Key.Substring(_lenPattern - 1);
        strPrior = target.Key.Substring(1, _lenPattern - 1);
        Console.Write(newchar);
      }
      Console.WriteLine();
      return nChars;
    }


    private void FillDictionaryFromReadingTextOptimum()
    {
      var strPrior = " ";
      for (int i = 0; i < _txt.Length; i++)
      {
        // read a character from the text
        var chrNew = _txt[i];
        if (strPrior.Length < _lenPattern)
        {
          strPrior += chrNew;
          continue;
        }
        // we now have prior with length = patlen -1 and a new character
        cPriorData pData = null;
        // have we encountered this strPrior before?
        if (!_dictCPriorData.TryGetValue(strPrior, out pData))
        {
          pData = new cPriorData();
          _dictCPriorData[strPrior] = pData;
        }
        // increment the # of times we've seen it
        pData.cnt += 1;
        // for this strPrior, have we seen the chrNew before?
        if (pData._dictChars.ContainsKey(chrNew))
        {
          pData._dictChars[chrNew] += 1;
        }
        else
        {
          pData._dictChars[chrNew] = 1;
        }
        if (strPrior.Length > 0)
        {
          // remove 1st char
          strPrior = strPrior.Substring(1);
        }
        strPrior += chrNew;
      }
    }

    private int GenerateRandomTextOptimum()
    {
      var strPrior = string.Empty;
      int nChars;
      for (nChars = 0; nChars < 1000000 && !Console.KeyAvailable; nChars++)
      {
        if (string.IsNullOrEmpty(strPrior))
        {
          // we need some sort of initial prior string
          // so we'll take it from a random entry in the data
          int numSkip = _rand.Next(Math.Min(10000, _dictCPriorData.Count));
          if (_dictCPriorData.Count < 30)
          {
            // when testing with very few items
            numSkip = 0;
          }
          strPrior = _dictCPriorData
              .Skip(numSkip)
              // look for one that is preceded by " "
              .SkipWhile(
                  d => !d.Key.StartsWith(" "))
              .First()
              .Key;
          // we need to print the new prior string	
          // indicates it's a restart by using Upper case
          Console.Write(strPrior.ToUpper());
        }
        cPriorData pData = null;
        if (!_dictCPriorData.TryGetValue(strPrior, out pData))
        {
          // for some reason, the sequence we have isn't found. 
          // could be that the last sequence in the input text is unique
          // and because it's the end of the text, it's not in the data
          // reset to new random start
          strPrior = string.Empty;
          continue;
        }
        int targetRand = _rand.Next(pData.cnt);
        var target = pData._dictChars
            // we subtract each entry's value until we reach 0
            .Where(d => (targetRand -= d.Value) < 0)
            .FirstOrDefault();
        // remove 1st char from the strPrior and add the target char at the end
        strPrior = strPrior.Substring(1) + target.Key;
        Console.Write(target.Key);
      }
      Console.WriteLine();
      return nChars;
    }

  }
}

</code>

Comments (0)

Skip to main content