Mar 09, 2010
 

I have always believed that when it comes to blogging, brevity is the soul, and it's very important to convey your idea in as few words as possible. Of course, technical blogs are an altogether different ball game: most visitors arrive through search engines looking for specific topics, so you can afford to be verbose. But with a mixed audience and a variety of blog subjects, it is extremely important to keep posts as short and simple as possible to ensure maximum reach.

To prove this point, I set out to find whether there is a relation between the length of each post and the number of comments it receives. Though the number of comments is not the best yardstick for measuring a post's popularity, it certainly is the most concrete one. The best place to try this out seemed to be my organization’s internal blogosphere, which had a lot of sample data (3500+ posts). Since it's impossible to manually collect data for that many posts, I wrote a program that writes the data to a text file, from which any correlation can be identified. It's a mixture of retrieving the data through HTML and RSS and parsing it. Though it should be compatible with any WordPress version, the comment-counting function might need some tweaking to make it work on later versions. Once the code finishes executing, you will have a text file with the name of each post, its length and its number of comments. The data in this file can be imported into Excel for further manipulation.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
using System;
using System.Xml;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Windows.Forms;
 
/// <summary>
/// Walks the paged RSS feed at <see cref="_feedURL"/> and appends one line per post to
/// Results.txt in the form "title: length:comments".
/// </summary>
public class TestRSS : IDisposable
{
    // Namespace URI of the RSS content module; its "encoded" element carries the post body.
    private const string _contentModuleNs = "http://purl.org/rss/1.0/modules/content/";

    // Opened in append mode, so results from repeated runs accumulate in the same file.
    private readonly StreamWriter _swObj = new StreamWriter("Results.txt", true);
    public const string _feedURL = "http://blog.ganeshran.com";

    public static void Main()
    {
        // Dispose the instance so the results file handle is released deterministically.
        using (TestRSS _tests = new TestRSS())
        {
            _tests.RunTests();
        }
    }

    /// <summary>
    /// Fetches feed pages one after another and writes a result line for every item,
    /// stopping at the first page that contains no items.
    /// </summary>
    public void RunTests()
    {
        // Deliberately unbounded: the loop ends when a page comes back without any posts.
        for (int i = 1; ; i++)
        {
            XmlDocument _xdoc = new XmlDocument();
            // Page 1 is "?feed=rss2"; later pages are "?paged=N&feed=rss2".
            _xdoc.LoadXml(GetXmlData(_feedURL + (i > 1 ? "?paged=" + i + "&" : "?") + "feed=rss2"));
            XmlNodeList _xList = _xdoc.GetElementsByTagName("item");
            if (_xList.Count == 0)
            {
                break; // No more blog posts.
            }

            foreach (XmlNode _tempNode in _xList)
            {
                WriteToFile(FormatItem(_tempNode));
            }
        }
    }

    /// <summary>
    /// Builds the colon-delimited result line for a single feed item.
    /// Example: "ASP.NET Evolution WebForms v/s MVC: 6060:0".
    /// </summary>
    /// <param name="_item">An RSS "item" element.</param>
    /// <returns>The post title, body length and comment count joined by colons.</returns>
    private string FormatItem(XmlNode _item)
    {
        // Look elements up by name instead of by position, so a feed that orders
        // or omits elements differently does not yield the wrong node or a null dereference.
        XmlNode _title = _item.SelectSingleNode("title");
        XmlNode _link = _item.SelectSingleNode("link");

        // Prefer content:encoded for the post body; fall back to description when it is absent.
        XmlNode _body = _item["encoded", _contentModuleNs];
        if (_body == null)
        {
            _body = _item.SelectSingleNode("description");
        }

        // The colon is the field delimiter, so strip any colons from the title itself.
        string _name = _title == null ? "" : _title.InnerText.Replace(":", "");
        int _length = _body == null ? 0 : _body.InnerText.Length;
        // Without a link there is no post page to fetch, so report zero comments.
        int _comments = (_link == null || _link.InnerText.Length == 0) ? 0 : GetComments(_link.InnerText);

        return _name + ": " + _length + ":" + _comments;
    }

    /// <summary>
    /// Downloads the document at the given URL and returns it with all script elements removed.
    /// </summary>
    /// <param name="_url">Absolute HTTP URL to retrieve.</param>
    /// <returns>The response body with script elements stripped.</returns>
    public string GetXmlData(string _url)
    {
        // Plain retrieval through HttpWebRequest rather than XmlDocument.Load: script tags
        // can make a feed badly formed, so they are stripped before the text is parsed as XML.
        HttpWebRequest _blogReq = (HttpWebRequest)WebRequest.Create(_url);
        // This method is called once per feed page and once per post, so the response and
        // reader must be released each time.
        using (HttpWebResponse _blogResp = (HttpWebResponse)_blogReq.GetResponse())
        using (StreamReader _respStream = new StreamReader(_blogResp.GetResponseStream()))
        {
            return Regex.Replace(_respStream.ReadToEnd(), @"<script[^>]*?>[\s\S]*?<\/script>", "");
        }
    }

    /// <summary>
    /// Counts the comments on a post by fetching its HTML page and matching the comment divs.
    /// </summary>
    /// <param name="_url">URL of the post's HTML page.</param>
    /// <returns>The number of comment divs found on the page.</returns>
    public int GetComments(string _url)
    {
        // The comments are read from the HTML and not from the RSS feed, because no workaround
        // was found for the 10-comment limit of the RSS feed in WordPress 2.3. This will not
        // work as-is for later WordPress versions; substitute the comment tag accordingly.
        return Regex.Matches(GetXmlData(_url), "<div class=\"mycomment\"[^>]*?>[\\s\\S]*?<\\/div>").Count;
    }

    /// <summary>
    /// Appends one line to the results file and flushes it immediately.
    /// </summary>
    /// <param name="_tobeWritten">The line to write.</param>
    public void WriteToFile(string _tobeWritten)
    {
        _swObj.WriteLine(_tobeWritten);
        // Flush after every line so results written so far survive a failure mid-run.
        _swObj.Flush();
    }

    /// <summary>
    /// Closes the results file.
    /// </summary>
    public void Dispose()
    {
        _swObj.Dispose();
    }
}