VintaSoft Imaging .NET SDK 14.0: Documentation for .NET developer
Vintasoft.Imaging.Text Namespace / TextRegion Class / GetWords Methods / GetWords(TextRegionLineSymbolPredicate,TextRegionLineSymbolPredicate,TextRegionLineSymbolPredicate) Method
Syntax Example Requirements SeeAlso
In This Topic
    GetWords(TextRegionLineSymbolPredicate,TextRegionLineSymbolPredicate,TextRegionLineSymbolPredicate) Method (TextRegion)
    In This Topic
    Returns the words of this text region.
    Syntax

    Parameters

    wordCharacterPredicate
    The word character predicate that determines which characters are allowed in words.
    wordDelimiterPredicate
    The word delimiter predicate that determines which characters are word delimiters.
    whiteSpaceCharacterPredicate
    The white space character predicate that determines which characters are white space characters.

    Return Value

    An array of TextRegion objects, each of which defines a word.
    Example

    This C#/VB.NET code shows how to extract only the numbers from a PDF page.

    
    ''' <summary>
    ''' Returns only the numbers found on a PDF page.
    ''' </summary>
    ''' <param name="page">The PDF page to extract numbers from; must not be Nothing.</param>
    ''' <returns>
    ''' A string that contains the text content of every extracted word,
    ''' each followed by a line terminator.
    ''' </returns>
    ''' <exception cref="System.ArgumentNullException">
    ''' Thrown if <paramref name="page"/> is Nothing.
    ''' </exception>
    Public Shared Function GetOnlyNumbersFromPdfPage(page As Vintasoft.Imaging.Pdf.Tree.PdfPage) As String
        ' fail early with a descriptive exception instead of a NullReferenceException
        If page Is Nothing Then
            Throw New System.ArgumentNullException("page")
        End If

        ' split the text of the page into words using the predicates of this example:
        ' only numeric characters (and punctuation between them) are accepted as word characters
        Dim words As Vintasoft.Imaging.Text.TextRegion() = page.TextRegion.GetWords(AddressOf WordCharacterPredicate, AddressOf WordDelimiterPredicate, AddressOf WhiteSpaceCharacterPredicate)

        ' write the text of each word on its own line
        Dim result As New System.Text.StringBuilder()
        For Each word As Vintasoft.Imaging.Text.TextRegion In words
            result.AppendLine(word.TextContent)
        Next

        Return result.ToString()
    End Function
    
    ''' <summary>
    ''' The word character predicate: accepts numeric characters and
    ''' punctuation characters that stand directly between two numeric characters.
    ''' </summary>
    ''' <param name="lineSymbols">The symbols of the text line.</param>
    ''' <param name="symbolIndex">Index, in <paramref name="lineSymbols"/>, of the symbol to test.</param>
    ''' <returns>
    ''' True if the symbol is numeric, or is punctuation whose previous and next
    ''' symbols in the line are both numeric; otherwise, False.
    ''' </returns>
    Public Shared Function WordCharacterPredicate(lineSymbols As Vintasoft.Imaging.Text.TextRegionSymbol(), symbolIndex As Integer) As Boolean
        Dim current As Char = lineSymbols(symbolIndex).TextSymbol.Symbol

        ' a numeric character always belongs to a word
        If System.[Char].IsNumber(current) Then
            Return True
        End If

        ' anything that is neither numeric nor punctuation is rejected
        If Not System.[Char].IsPunctuation(current) Then
            Return False
        End If

        ' a missing neighbour (start or end of the line) is treated as a space
        Dim leftNeighbour As Char = " "C
        If symbolIndex > 0 Then
            leftNeighbour = lineSymbols(symbolIndex - 1).TextSymbol.Symbol
        End If

        Dim rightNeighbour As Char = " "C
        If symbolIndex < lineSymbols.Length - 1 Then
            rightNeighbour = lineSymbols(symbolIndex + 1).TextSymbol.Symbol
        End If

        ' punctuation counts only when it is enclosed by numeric characters on both sides
        Return System.[Char].IsNumber(leftNeighbour) AndAlso System.[Char].IsNumber(rightNeighbour)
    End Function
    
    ''' <summary>
    ''' The word delimiter predicate. This implementation never reports a symbol as a word delimiter.
    ''' </summary>
    ''' <param name="lineSymbols">The line symbols (not used by this implementation).</param>
    ''' <param name="symbolIndex">Index of the symbol (not used by this implementation).</param>
    ''' <returns>Always False.</returns>
    Public Shared Function WordDelimiterPredicate(lineSymbols As Vintasoft.Imaging.Text.TextRegionSymbol(), symbolIndex As Integer) As Boolean
        ' no symbol is reported as a delimiter; the example's intent is to exclude
        ' the punctuation marks from the text extraction
        ' NOTE(review): the exact effect depends on how GetWords uses delimiters - confirm
        Return False
    End Function
    
    ''' <summary>
    ''' The white space character predicate: tests whether a symbol of the line is white space.
    ''' </summary>
    ''' <param name="lineSymbols">The symbols of the text line.</param>
    ''' <param name="symbolIndex">Index, in <paramref name="lineSymbols"/>, of the symbol to test.</param>
    ''' <returns>True if System.Char.IsWhiteSpace accepts the character of the symbol; otherwise, False.</returns>
    Public Shared Function WhiteSpaceCharacterPredicate(lineSymbols As Vintasoft.Imaging.Text.TextRegionSymbol(), symbolIndex As Integer) As Boolean
        ' pick the symbol under test, then read its character
        Dim testedSymbol As Vintasoft.Imaging.Text.TextRegionSymbol = lineSymbols(symbolIndex)
        Dim current As Char = testedSymbol.TextSymbol.Symbol

        ' the classification is delegated to the framework
        Dim isWhiteSpace As Boolean = System.[Char].IsWhiteSpace(current)
        Return isWhiteSpace
    End Function
    
    
    
    /// <summary>
    /// Returns only the numbers found on a PDF page.
    /// </summary>
    /// <param name="page">The PDF page to extract numbers from; must not be <c>null</c>.</param>
    /// <returns>
    /// A string that contains the text content of every extracted word,
    /// each followed by a line terminator.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">
    /// Thrown if <paramref name="page"/> is <c>null</c>.
    /// </exception>
    public static string GetOnlyNumbersFromPdfPage(Vintasoft.Imaging.Pdf.Tree.PdfPage page)
    {
        // fail early with a descriptive exception instead of a NullReferenceException
        if (page == null)
        {
            throw new System.ArgumentNullException("page");
        }

        // split the text of the page into words using the predicates of this example:
        // only numeric characters (and punctuation between them) are accepted as word characters
        Vintasoft.Imaging.Text.TextRegion[] words = page.TextRegion.GetWords(
            WordCharacterPredicate,
            WordDelimiterPredicate,
            WhiteSpaceCharacterPredicate);

        // write the text of each word on its own line
        System.Text.StringBuilder result = new System.Text.StringBuilder();
        foreach (Vintasoft.Imaging.Text.TextRegion word in words)
        {
            result.AppendLine(word.TextContent);
        }

        return result.ToString();
    }
    
    /// <summary>
    /// The word character predicate: accepts numeric characters and
    /// punctuation characters that stand directly between two numeric characters.
    /// </summary>
    /// <param name="lineSymbols">The symbols of the text line.</param>
    /// <param name="symbolIndex">Index, in <paramref name="lineSymbols"/>, of the symbol to test.</param>
    /// <returns>
    /// <c>true</c> if the symbol is numeric, or is punctuation whose previous and next
    /// symbols in the line are both numeric; otherwise, <c>false</c>.
    /// </returns>
    public static bool WordCharacterPredicate(
       Vintasoft.Imaging.Text.TextRegionSymbol[] lineSymbols,
       int symbolIndex)
    {
        char current = lineSymbols[symbolIndex].TextSymbol.Symbol;

        // a numeric character always belongs to a word
        if (System.Char.IsNumber(current))
        {
            return true;
        }

        // anything that is neither numeric nor punctuation is rejected
        if (!System.Char.IsPunctuation(current))
        {
            return false;
        }

        // a missing neighbour (start or end of the line) is treated as a space
        char leftNeighbour = ' ';
        if (symbolIndex > 0)
        {
            leftNeighbour = lineSymbols[symbolIndex - 1].TextSymbol.Symbol;
        }

        char rightNeighbour = ' ';
        if (symbolIndex < lineSymbols.Length - 1)
        {
            rightNeighbour = lineSymbols[symbolIndex + 1].TextSymbol.Symbol;
        }

        // punctuation counts only when it is enclosed by numeric characters on both sides
        return System.Char.IsNumber(leftNeighbour) && System.Char.IsNumber(rightNeighbour);
    }
    
    /// <summary>
    /// The word delimiter predicate. This implementation never reports a symbol as a word delimiter.
    /// </summary>
    /// <param name="lineSymbols">The line symbols (not used by this implementation).</param>
    /// <param name="symbolIndex">Index of the symbol (not used by this implementation).</param>
    /// <returns>Always <c>false</c>.</returns>
    public static bool WordDelimiterPredicate(
       Vintasoft.Imaging.Text.TextRegionSymbol[] lineSymbols,
       int symbolIndex)
    {
        // no symbol is reported as a delimiter; the example's intent is to exclude
        // the punctuation marks from the text extraction
        // NOTE(review): the exact effect depends on how GetWords uses delimiters - confirm
        return false;
    }
    
    /// <summary>
    /// The white space character predicate: tests whether a symbol of the line is white space.
    /// </summary>
    /// <param name="lineSymbols">The symbols of the text line.</param>
    /// <param name="symbolIndex">Index, in <paramref name="lineSymbols"/>, of the symbol to test.</param>
    /// <returns><c>true</c> if <see cref="System.Char.IsWhiteSpace(char)"/> accepts the character of the symbol; otherwise, <c>false</c>.</returns>
    public static bool WhiteSpaceCharacterPredicate(
       Vintasoft.Imaging.Text.TextRegionSymbol[] lineSymbols,
       int symbolIndex)
    {
        // pick the symbol under test, then read its character
        Vintasoft.Imaging.Text.TextRegionSymbol testedSymbol = lineSymbols[symbolIndex];
        char current = testedSymbol.TextSymbol.Symbol;

        // the classification is delegated to the framework
        bool isWhiteSpace = System.Char.IsWhiteSpace(current);
        return isWhiteSpace;
    }
    
    

    Requirements

    Target Platforms: .NET 9; .NET 8; .NET 7; .NET 6; .NET Framework 4.8, 4.7, 4.6, 4.5, 4.0, 3.5

    See Also