// Examples >
//
// Extract Bigrams, Trigrams, and Ngrams

/*
 * Copyright (c) 2006-2018 North Concepts Inc.  All rights reserved.
 * Proprietary and Confidential.  Use is subject to license terms.
 *
 * http://northconcepts.com/data-pipeline/licensing/
 *
 */
package com.northconcepts.datapipeline.examples.cookbook;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import com.northconcepts.datapipeline.core.DataReader;
import com.northconcepts.datapipeline.core.DataWriter;
import com.northconcepts.datapipeline.core.LimitReader;
import com.northconcepts.datapipeline.core.SequenceReader;
import com.northconcepts.datapipeline.core.SortingReader;
import com.northconcepts.datapipeline.csv.CSVWriter;
import com.northconcepts.datapipeline.group.GroupByReader;
import com.northconcepts.datapipeline.job.Job;
import com.northconcepts.datapipeline.transform.BasicFieldTransformer;
import com.northconcepts.datapipeline.transform.Ngrams;
import com.northconcepts.datapipeline.transform.TransformingReader;
import com.northconcepts.datapipeline.xml.XmlRecordReader;

/**
 * Cookbook example: reads the {@code /rss/channel/item} records of several news RSS
 * feeds, derives phrases of {@link #NGRAMS} words from each item's lower-cased
 * {@code title} field, counts how often each phrase occurs, and writes the
 * {@link #TOP_PHRASES} most frequent phrases as CSV to standard output.
 */
public class ExtractBigramsTrigramsAndNgrams {

    /** Number of words per phrase -- bigrams: 2; trigrams: 3; quadrigrams: 4. */
    private static final int NGRAMS = 3;

    /** Maximum number of phrases written to the output. */
    private static final int TOP_PHRASES = 25;

    /** RSS feeds whose item titles are analysed. */
    private static final String[] URLS = {
            "https://rss.cbc.ca/lineup/topstories.xml",
            "https://rss.cbc.ca/lineup/world.xml",
            "https://rss.cbc.ca/lineup/canada.xml",
            "https://rss.cbc.ca/lineup/politics.xml",
            "https://rss.cbc.ca/lineup/business.xml",
            "https://rss.cbc.ca/lineup/health.xml",
            "https://rss.cbc.ca/lineup/arts.xml",
            "https://rss.cbc.ca/lineup/technology.xml",
            "https://rss.cbc.ca/lineup/offbeat.xml",
            "https://www.cbc.ca/cmlink/rss-cbcaboriginal",

            "https://globalnews.ca/feed/",
            "https://globalnews.ca/canada/feed/",
            "https://globalnews.ca/world/feed/",
            "https://globalnews.ca/politics/feed/",
            "https://globalnews.ca/money/feed/",
            "https://globalnews.ca/health/feed/",
            "https://globalnews.ca/entertainment/feed/",
            "https://globalnews.ca/environment/feed/",
            "https://globalnews.ca/tech/feed/",
            "https://globalnews.ca/sports/feed/",

            "https://www.ctvnews.ca/rss/ctvnews-ca-top-stories-public-rss-1.822009",
            "https://www.ctvnews.ca/rss/ctvnews-ca-canada-public-rss-1.822284",
            "https://www.ctvnews.ca/rss/ctvnews-ca-world-public-rss-1.822289",
            "https://www.ctvnews.ca/rss/ctvnews-ca-entertainment-public-rss-1.822292",
            "https://www.ctvnews.ca/rss/ctvnews-ca-politics-public-rss-1.822302",
            "https://www.ctvnews.ca/rss/lifestyle/ctv-news-lifestyle-1.3407722",
            "https://www.ctvnews.ca/rss/business/ctv-news-business-headlines-1.867648",
            "https://www.ctvnews.ca/rss/ctvnews-ca-sci-tech-public-rss-1.822295",
            "https://www.ctvnews.ca/rss/sports/ctv-news-sports-1.3407726",
            "https://www.ctvnews.ca/rss/ctvnews-ca-health-public-rss-1.822299",
            "https://www.ctvnews.ca/rss/autos/ctv-news-autos-1.867636",
            };

    /**
     * Builds the reader pipeline over all feeds in {@link #URLS} and runs it.
     *
     * @param args ignored
     * @throws Throwable if a feed cannot be opened or the pipeline fails
     */
    public static void main(String[] args) throws Throwable {

        SequenceReader sequenceReader = new SequenceReader();

        // All feed streams are opened up front, so remember them: if a later feed
        // fails to open, the connections that were already opened must be released.
        List<BufferedReader> openedInputs = new ArrayList<BufferedReader>();

        try {
            for (String url : URLS) {
                BufferedReader input = new BufferedReader(
                        new InputStreamReader(new URL(url).openStream(), StandardCharsets.UTF_8));
                openedInputs.add(input);
                sequenceReader.add(new XmlRecordReader(input).addRecordBreak("/rss/channel/item"));
            }
        } catch (Throwable e) {
            closeQuietly(openedInputs);
            throw e;
        }

        DataReader reader = sequenceReader;

        // Lower-case the title first so that phrases differing only in case are
        // grouped together; the phrases are placed in the "phrase" field.
        reader = new TransformingReader(reader)
                .add(new BasicFieldTransformer("title").lowerCase())
                .add(new Ngrams("title", "phrase", NGRAMS));

        // One record per distinct phrase, with its number of occurrences in "count".
        reader = new GroupByReader(reader, "phrase")
                .setExcludeNulls(true)
                .count("count", true);

        // Most frequent first; ties are ordered alphabetically by phrase.
        reader = new SortingReader(reader).desc("count").asc("phrase");

        reader = new LimitReader(reader, TOP_PHRASES);

        // NOTE(review): no charset is given here, so the writer encodes with the
        // platform default charset -- confirm this is acceptable for non-ASCII titles.
        DataWriter writer = new CSVWriter(new OutputStreamWriter(System.out))
                .setFieldNamesInFirstRow(true);

        Job.run(reader, writer);

    }

    /**
     * Closes every given reader, ignoring failures, so that the exception that
     * triggered the cleanup is the one the caller reports.
     */
    private static void closeQuietly(List<BufferedReader> inputs) {
        for (BufferedReader input : inputs) {
            try {
                input.close();
            } catch (IOException ignored) {
                // Best-effort cleanup only; the original failure is rethrown by the caller.
            }
        }
    }

}
// Mobile Analytics