By default, Episerver Find indexes content before it is published. Although it is easy to prevent unpublished content from being served as search results, potentially secret information that is not yet available to the public is still sent to the indexing service outside of the office. Therefore we decided not to allow documents to be indexed before publication. We also needed the ability to occasionally exclude individual pages and files from being indexed. Another requirement was to exclude files from the search results if the files weren’t linked from any page.

I have seen two solutions approaching similar problems, written by Henrik Lindström and Erik Herlitz. What my solution adds is the ability to index files linked from nested blocks.

Content types that implement the ISearchable interface, as shown below, allow an editor to optionally exclude content from being indexed.


/// <summary>
/// Opt-out marker for content types whose instances can be excluded from
/// the search index on a per-item basis.
/// </summary>
public interface ISearchable
{
  /// <summary>
  /// When <c>true</c>, the indexing callbacks treat the content as excluded
  /// from the search index.
  /// </summary>
  bool NotSearchable { get; set; }
}
    
/// <summary>
/// Media content type that implements <see cref="ISearchable"/>, so a
/// single file can be flagged as excluded from the search index.
/// </summary>
[ContentType]
public class GenericMedia : MediaData, ISearchable
{
  /// <summary>
  /// When <c>true</c>, this file is flagged as not searchable.
  /// </summary>
  public virtual bool NotSearchable { get; set; }
}

Callbacks are initialized in an InitializableModule. The callbacks are executed either during the full scheduled indexing job or as a result of an event triggered by the CMS, such as content being saved or published. To avoid endless recursion caused by cyclic references, there is a maximum depth (level) at which the search for a containing page is terminated.


/// <summary>
/// Initialization module that registers the Episerver Find "should index"
/// callbacks: pages are indexed only when published and not flagged as
/// <see cref="ISearchable.NotSearchable"/>; media files are indexed only when
/// they have an accepted extension, are not flagged, and can be traced back
/// (directly or through referencing content) to an indexable page.
/// </summary>
[InitializableModule]
[ModuleDependency(typeof(EPiServer.Web.InitializationModule))]
public class FindIndexFiltering : IInitializableModule
{
  // Maximum number of reference hops followed when looking for a containing
  // page. Bounds the recursion so that cyclic references cannot loop forever.
  private const int MaxReferenceDepth = 8;

  // File extensions (lower case, without the dot) that are allowed in the index.
  private readonly List<string> AcceptedFileExtensions =
    new List<string>() { "doc", "docx", "ppt", "pptx", "pdf", "xls", "xlsx" };

  /// <summary>
  /// Registers the indexing conventions with the content indexer.
  /// </summary>
  /// <param name="context">The initialization engine (not used).</param>
  public void Initialize(InitializationEngine context)
  {
    // NOTE(review): the type argument of this first convention was lost in the
    // published listing. BlockData is assumed (blocks are only traversed as
    // containers, never indexed themselves) - confirm against the original.
    ContentIndexer.Instance.Conventions.ForInstancesOf<BlockData>().ShouldIndex(x => false);
    ContentIndexer.Instance.Conventions.ForInstancesOf<PageData>()
      .ShouldIndex(ShouldIndexPagedata);
    ContentIndexer.Instance.Conventions.ForInstancesOf<IContentMedia>()
      .ShouldIndex(ShouldIndexContentMedia);
  }

  /// <summary>
  /// Returns true when the given content has been flagged by an editor as
  /// not searchable. Content that does not implement
  /// <see cref="ISearchable"/> is never considered flagged.
  /// </summary>
  private static bool IsMarkedNotSearchable(object content)
  {
    var searchable = content as ISearchable;
    return searchable != null && searchable.NotSearchable;
  }

  /// <summary>
  /// Determines whether the content identified by <paramref name="cr"/> is,
  /// or is referenced (possibly through several levels of other content) by,
  /// a page that is allowed in the index.
  /// </summary>
  /// <param name="cr">Reference to the content to examine.</param>
  /// <param name="language">
  /// Language name of the owner that led here. Currently not used when
  /// loading the content; kept so the signature stays stable.
  /// </param>
  /// <param name="level">Current recursion depth, starting at 0.</param>
  /// <returns>
  /// <c>true</c> if an indexable containing page was found; <c>false</c> if
  /// the content is outside its publish window, the page is flagged as not
  /// searchable, no page was found, or the depth limit was exceeded.
  /// </returns>
  private bool IndexContainingPage(ContentReference cr, string language, int level)
  {
    if (level > MaxReferenceDepth)
    {
      return false;
    }

    var repository = ServiceLocator.Current.GetInstance<IContentRepository>();
    IContent content;
    // A failed lookup leaves content null; the reference search below still runs.
    repository.TryGet(cr, out content);

    var versionable = content as IVersionable;
    if (versionable != null)
    {
      // Read the clock once so both ends of the window use the same instant.
      var now = DateTime.Now;
      var hasStarted = versionable.StartPublish == null || versionable.StartPublish < now;
      var hasNotStopped = versionable.StopPublish == null || versionable.StopPublish > now;
      if (!(hasStarted && hasNotStopped))
      {
        return false;
      }
    }

    // A page ends the search: it is indexable unless flagged by an editor.
    var page = content as PageData;
    if (page != null)
    {
      return !IsMarkedNotSearchable(page);
    }

    // Not a page (e.g. a block or a file): walk up through whatever links to it.
    foreach (var reference in repository.GetReferencesToContent(cr, false))
    {
      // Guard against a missing owner language instead of dereferencing it.
      var ownerLanguage = reference.OwnerLanguage != null ? reference.OwnerLanguage.Name : null;
      var languageMatches = reference.ReferencedLanguage == null ||
        reference.ReferencedLanguage.Name == ownerLanguage;

      if (languageMatches && IndexContainingPage(reference.OwnerID, ownerLanguage, level + 1))
      {
        return true;
      }
    }
    return false;
  }

  /// <summary>
  /// Indexing callback for media files. A file is indexed only when its
  /// extension is accepted, it is not flagged as not searchable, and a
  /// containing indexable page can be found.
  /// </summary>
  /// <param name="file">The media content being considered for indexing.</param>
  /// <returns><c>true</c> if the file should be indexed.</returns>
  private bool ShouldIndexContentMedia(IContentMedia file)
  {
    var extension = file.SearchFileExtension();
    // A file without a resolvable extension cannot match the accepted list.
    if (extension == null || !AcceptedFileExtensions.Contains(extension.ToLowerInvariant()))
    {
      return false;
    }

    if (!IsMarkedNotSearchable(file) && IndexContainingPage(file.ContentLink, null, 0))
    {
      return true;
    }

    // The file must not be searchable: ask the indexer to delete it in case
    // it was indexed earlier.
    IEnumerable<DeleteResult> result;
    ContentIndexer.Instance.TryDelete(file, out result);
    return false;
  }

  /// <summary>
  /// Indexing callback for pages. A page is indexed only when it is published
  /// and not flagged as not searchable.
  /// </summary>
  /// <param name="page">The page being considered for indexing.</param>
  /// <returns><c>true</c> if the page should be indexed.</returns>
  private bool ShouldIndexPagedata(PageData page)
  {
    if (!IsMarkedNotSearchable(page) && page.CheckPublishedStatus(PagePublishedStatus.Published))
    {
      return true;
    }

    // The page must not be searchable: ask the indexer to delete it in case
    // it was indexed earlier.
    IEnumerable<DeleteResult> result;
    ContentIndexer.Instance.TryDelete(page, out result);
    return false;
  }

  /// <summary>
  /// Nothing to tear down.
  /// </summary>
  /// <param name="context">The initialization engine (not used).</param>
  public void Uninitialize(InitializationEngine context)
  {
  }
}

A page or a block containing a link to a file will trigger the indexing callback and cause the file to be indexed whenever the page or block is changed (saved, published etc.). However, a page or a block containing another block (e.g. via content area) with a link to a file will not trigger the indexing event of the file when changed. In order to have the file indexed in such a scenario, the block linking to the file, or the file itself (necessary if it is not a page file), has to be “touched” (e.g. by checking “Update modified date”). Of course, one can always wait for the full index job to run.