// ==UserScript==
// @name          Inline Ad Stripper
// @namespace     http://arantius.com/misc/greasemonkey/
// @description	  Strip ads directly in the page right out!
// @include       *
// @exclude       file://*
// ==/UserScript==

// \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ //

const DEBUG=0;
const tld=getTld(document.location.href);
const b=document.body;
if ('undefined'==typeof b || 'undefined'==typeof b.scrollWidth) return;
const bSize=(b.scrollWidth*b.scrollHeight);
const adMaxSize=Math.max(Math.round(bSize/5), 150000);
const adTagNames={'TABLE':1, 'TR':1, 'TD':1, 'DIV':1, 'LI':1, 'UL':1, 'FIELDSET':1};
const maxInTldLinks=3;

const MAX_TIPOFF_LEN=200;
const MAX_AD_LEN=600;

// \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ //

// Approximate the "real" text of an element: take its markup (or, failing
// that, its text content), collapse whitespace, then drop script, noscript and
// iframe blocks, comments and all remaining tags.
// Returns the trimmed result, or '' when the element has no content.
function strippedTextContent(el) {
	var markup=el.innerHTML || el.textContent;
	if (!markup) return '';

	// Applied in this order: whole blocks and comments have to go before the
	// generic tag pattern would tear them apart.
	var removals=[
		/<script.*?\/script>/gi,     // js
		/<noscript.*?\/noscript>/gi, // no-js
		/<iframe.*?\/iframe>/gi,     // iframe, alternate content
		/<!--.*?-->/gi,              // comments
		/<\/?[^>]+>/gi               // tags
	];

	// Collapsing whitespace first lets '.' in the patterns span what used to
	// be line breaks.
	var stripped=markup.replace(/[\s]+/g, ' ');
	for (var n=0; n<removals.length; n++) {
		stripped=stripped.replace(removals[n], '');
	}

	return stripped.replace(/^\s+/, '').replace(/\s+$/, '');
}

// \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ //

// Find the "TLD" of a URL: the tail of its host name starting after a dot and
// at least six characters long (so "www.example.com" gives "example.com" and
// "foo.bar.co.uk" gives "bar.co.uk").  Hosts too short for that come back
// whole.
// url: the URL to inspect; anything that is not a string yields ''.
// Returns the extracted string; text that does not look like
// "scheme://host..." is run through the same tail heuristic as-is.
function getTld(url) {
	// hrefs are not always strings (e.g. an SVG <a> exposes an object), and
	// calling string methods on them would abort the whole script
	if ('string'!=typeof url) return '';

	// isolate the authority part; stop at the path, query or fragment so a
	// URL without a trailing slash does not drag its query into the host
	var found=/[a-z]+:\/\/([^\/?#]+)/.exec(url);
	var host=url;
	if (found) {
		host=found[1]
			.replace(/^.*@/, '')    // drop "user:password@"
			.replace(/:\d*$/, '');  // drop ":port"
	}

	// this terribly simple method seems to work good enough
	return host.replace(/.*\.(.*......)/, '$1');
}

// \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ //

// Given a tipoff element, climb up the tree to find the biggest container
// that probably contains the ad, and probably contains nothing else.
// tip: the element (or node) to start climbing from.
// Returns the outermost acceptable ancestor-or-self whose tag name is listed
// in adTagNames, or null when there is none or the climb ran into something
// that must not be stripped (an already stripped element, a <select>).
function tipoffToAd(tip) {
	var cont=tip, ad=null;

	// climb up the tree to try to find the element that contains the whole ad
	while (cont) {
		// if it's too big, keep whatever was found below it
		var area=cont.scrollWidth*cont.scrollHeight;
		if (area > adMaxSize) {
			if (DEBUG>1) console.log(
				'stopping climb because this one is too big',
				cont, area, '>', adMaxSize
			);
			break;
		}

		// if THIS is a link to this same domain, forget it
		if ('A'==cont.tagName && tld==getTld(cont.href)) {
			if (DEBUG>1) console.log(
				'stopping climb because THIS is in-tld link',
				tld, getTld(cont.href)
			);
			break;
		}

		// if the link(s) in it are to this domain, no
		var contLinks=cont.getElementsByTagName('a');
		var inTldLinks=0;
		if (contLinks.length>1) {
			// counting stops as soon as the limit is exceeded
			for (var i=0; i<contLinks.length && inTldLinks<=maxInTldLinks; i++) {
				if (tld==getTld(contLinks[i].href)) {
					inTldLinks++;
				}
			}
			if (DEBUG>1) console.log('in tld links', inTldLinks);

			// leave the climb itself, not just the counting loop, so this
			// container can no longer be picked as the ad
			if (inTldLinks>maxInTldLinks) {
				if (DEBUG>1) console.log(
					'stopping climb because of in-tld links inside',
					inTldLinks, 'max', maxInTldLinks
				);
				break;
			}
		}

		// if we've stripped it before, no
		if (cont.inline_stripped) {
			if (DEBUG>1) console.log('skipping because stripped earlier');
			ad=null;
			break;
		}

		// if it was inside a <select> then it's an inappropriate target to strip
		if ('SELECT'==cont.tagName) {
			if (DEBUG>1) console.log('skipping because we were inside SELECT', cont);
			ad=null;
			break;
		}

		// final resort, make sure we don't match too much text
		if (strippedTextContent(cont).length > MAX_AD_LEN) {
			if (DEBUG>1) console.log('stopping climb because too much text', cont);
			break;
		}

		// if we got here, and it's the right kind, mark container as ad
		if (cont.tagName in adTagNames) ad=cont;

		// we didn't stop above, so climb
		cont=cont.parentNode;

		if (DEBUG>1) console.debug('about to start loop with', cont);
	}

	return ad;
}

// \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ // \\ //

// Strip one ad container, at most once per element.
// foundEl: the element that tipped us off, or null; only used for debug
//          highlighting.
// adEl:    the container to strip; gets flagged with inline_stripped=true.
// With DEBUG off the container is hidden (display: none); with DEBUG on it is
// outlined and made translucent instead, and the tipoff is colored.
function stripAd(foundEl, adEl) {
	// report on the element being stripped (adEl), not on whatever the
	// caller's own variables happen to hold
	if (DEBUG>1) console.log(
		'to strip tip:', foundEl, 'ad:', adEl,
		adEl.scrollWidth+'x'+adEl.scrollHeight+' = '+(adEl.scrollWidth*adEl.scrollHeight)+' <= '+adMaxSize
	);

	// in case one piece matched more than once
	if (adEl.inline_stripped) return;

	adEl.inline_stripped=true;

	if (DEBUG) {
		if (foundEl) {
			//point out the found word
			foundEl.style.color='green';
			foundEl.style.backgroundColor='red';
		}

		// best effort only: a failure to decorate must not stop the script
		try {
			adEl.style.outline='2px dotted red';
			adEl.style.opacity=0.5;
		} catch (e) { }
	} else {
		adEl.style.display='none';
	}
}

///////////////////////////// Based on Attributes //////////////////////////////

// Values that, when they are the exact value of an element's alt, class or id
// attribute, mark that element as an ad.
var attrTipoffs=[
	// these two for demonoid
	'a44', 'pad9px_right',
	'Ad', 'ad', 'ad-splash', 'Ad120x600', 'Ad336x280', 'Ad728x90', 'ad_box',
	'ad_featurebar', 'adBannerTable', 'adbar', 'adContainer', 'adhere',
	'adright', 'adtop', 'advert', 'advertisement', 'Banner', 'banner_ad',
	'bannerAd', 'bannerad', 'cnnContextualLinksBox', 'contextualLinks',
	'gridBannerRowCell', 'leaderboardAd', 'main_ad', 'RadAd_Banner',
	'sponLinkDiv_1', 'sponsored', 'TopAdCenter'
];

// One "//*[@attr='v1' or @attr='v2' ...]" path per inspected attribute, all
// of them unioned into a single query.
var ads=document.evaluate(
	['alt', 'class', 'id'].map(function (attrName) {
		return "//*[@"+attrName+"='"+attrTipoffs.join("' or @"+attrName+"='")+"']";
	}).join(" | "),
	document, null, XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null
);

// Attribute matches are stripped as they are, without climbing the tree.
var i=0, ad=null;
while ((ad=ads.snapshotItem(i))) {
	if (DEBUG) console.info('attr tipped off:', ad);
	stripAd(null, ad);
	i++;
}


////////////////////////////// Based on Content ////////////////////////////////

// Substrings that give an ad away when found in a text node or in the src of
// an image (matching is case sensitive, hence the spelling variants).
var contentTipoffs=[
	'ADVERTIS', 'Advertis', 'advertis', 'Ad Link', 'Ads by', 'Ads',
	'SPONSOR', 'Sponsor', 'sponsor',
	'Your Ad Here'
];

var ad;
// Parents of matching text nodes, plus matching images.
var ads=document.evaluate(
	(function () {
		// "contains(subject, 'a') or contains(subject, 'b') ..."
		function containsAny(subject) {
			return "contains("+subject+", '"+
				contentTipoffs.join("') or contains("+subject+", '")+"')";
		}
		return "//text()["+containsAny('.')+"]/.. | "+
			"//img["+containsAny('@src')+"]";
	})(),
	document, null, XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null
);
for (var i=0, tip=ads.snapshotItem(0); tip; tip=ads.snapshotItem(++i)) {
	// only go on when the initial tipoff wasn't part of a long paragraph,
	// and wasn't <script> content
	if (strippedTextContent(tip).length <= MAX_TIPOFF_LEN && 'SCRIPT'!=tip.tagName) {
		if (DEBUG) console.info('content tipped off:', tip);
		// look for the container that holds the whole ad
		ad=tipoffToAd(tip);
		if (DEBUG>1) console.log('content loop ended with ad:', ad, 'tip:', tip);
		if (ad) stripAd(tip, ad);
	}
}

//////////////////// Tables (or divs), based on ad scripts /////////////////////

// This section combats the oh-so-common practice of smashing a series of
// images next to an ad block: AdBlock Plus gets those ads, but misses the
// pictures next to them, leaving wasted blank space.

// Prefixes of script src values that identify an ad script.
var scriptTipoffs=[
	'http://pagead2.googlesyndication.com/',
	'http://ypn-js.overture.com/',
	'http://ad'
];

var ad;
var scripts=document.evaluate(
	"//script["+scriptTipoffs.map(function (prefix) {
		return "starts-with(@src, '"+prefix+"')";
	}).join(" or ")+"]",
	document, null, XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null
);
for (var i=0, script=scripts.snapshotItem(0); script; script=scripts.snapshotItem(++i)) {
	if (DEBUG) console.info('script tipped off:', script);

	// a container around the script that also holds several images
	ad=tipoffToAd(script.parentNode);
	if (ad && ad.getElementsByTagName('img').length>2) {
		stripAd(null, ad);
		continue;
	}

	// If this is just a div (with padding, still taking up space?) holding
	// adsense, make sure it's gone.
	if ('DIV'==script.parentNode.tagName &&
		0==strippedTextContent(script.parentNode).length
	) {
		stripAd(script, script.parentNode);
	}
}
